From 40d271ea668c8ff113044d33f151cf4db300777a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 12 Sep 2023 04:28:58 -0700 Subject: [PATCH 001/115] commoncrawl Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small.yaml | 80 ++ dataset_configs/commoncrawl/small_de.yaml | 172 ++++ dataset_configs/commoncrawl/small_de_en.yaml | 159 ++++ dataset_configs/commoncrawl/small_en.yaml | 202 +++++ dataset_configs/commoncrawl/small_es.yaml | 187 ++++ dataset_configs/commoncrawl/small_fr.yaml | 169 ++++ dataset_configs/commoncrawl/small_pl.yaml | 175 ++++ .../commoncrawl/small_sentence.yaml | 72 ++ .../datasets/commoncrawl/__init__.py | 15 + .../datasets/commoncrawl/commoncrawl.py | 645 ++++++++++++++ .../datasets/commoncrawl/harv_utils.py | 825 ++++++++++++++++++ .../datasets/commoncrawl/requirements.txt | 7 + 12 files changed, 2708 insertions(+) create mode 100644 dataset_configs/commoncrawl/small.yaml create mode 100644 dataset_configs/commoncrawl/small_de.yaml create mode 100644 dataset_configs/commoncrawl/small_de_en.yaml create mode 100644 dataset_configs/commoncrawl/small_en.yaml create mode 100644 dataset_configs/commoncrawl/small_es.yaml create mode 100644 dataset_configs/commoncrawl/small_fr.yaml create mode 100644 dataset_configs/commoncrawl/small_pl.yaml create mode 100644 dataset_configs/commoncrawl/small_sentence.yaml create mode 100644 sdp/processors/datasets/commoncrawl/__init__.py create mode 100644 sdp/processors/datasets/commoncrawl/commoncrawl.py create mode 100644 sdp/processors/datasets/commoncrawl/harv_utils.py create mode 100644 sdp/processors/datasets/commoncrawl/requirements.txt diff --git a/dataset_configs/commoncrawl/small.yaml b/dataset_configs/commoncrawl/small.yaml new file mode 100644 index 00000000..a261dd39 --- /dev/null +++ b/dataset_configs/commoncrawl/small.yaml @@ -0,0 +1,80 @@ +processors_to_run: "9:" +workspace_dir: /mnt/ssd8/cc_sdp +final_manifest: ${workspace_dir}/full_manifest.json +group_duration_threshold: 20.0 + 
+processors: + - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_manifest_file: ${workspace_dir}/manifest0.json + resampled_audio_dir: ${workspace_dir}/audio/ + target_samplerate: 16000 + target_nchannels: 1 + audio_field: "audios" + video_field: "videos" + key_field: "key" + text_field: "texts" + + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt + input_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest1.json + vtt_files_dir: ${workspace_dir}/vtts/ + key_field: "key" + text_field: "texts" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.datasets.commoncrawl.AllVttText + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest2.json + input_filepath_field: vtt_filepath + output_text_field: vtt_text + + - _target_: sdp.processors.datasets.commoncrawl.TextLid + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + input_text_field: vtt_text + output_lang_field: text_lang + device: cuda + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso + input_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest4.json + input_lang_field: text_lang + output_lang_field: text_lang + + - _target_: sdp.processors.datasets.commoncrawl.AudioLid + input_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json + input_audio_field: audios + output_lang_field: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + - _target_: sdp.processors.datasets.commoncrawl.SplitByVtt + input_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json + splited_audio_dir: 
${workspace_dir}/splited/ + source_audio_field: audios + audio_lang_field: audio_lang + text_lang_field: text_lang + key_field: "key" + target_audio_field: "audio_filepath" + duration_field: "durations" + text_field: "text" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.RenameFields + input_manifest_file: ${workspace_dir}/manifest6.json + rename_fields: {"durations": duration} + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest7.json + high_duration_threshold: 40 + low_duration_threshold: 0.2 + + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_manifest_file: ${workspace_dir}/manifest8.json + output_text_field: url + key_field: key \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_de.yaml b/dataset_configs/commoncrawl/small_de.yaml new file mode 100644 index 00000000..ce8b9d27 --- /dev/null +++ b/dataset_configs/commoncrawl/small_de.yaml @@ -0,0 +1,172 @@ +processors_to_run: "3:" +workspace_dir: /mnt/ssd8/cc_sdp/de # ü ä ö ß Ä Ö Ü + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: de + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: de + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest3.json + max_workers: 20 + regex_patterns: + # - '://' + # - '\\x' + - 'é' + - 'ô' + - '×' + - 'š' + - '\u202a' + - 'č' + - 'ć' + - 'á' + - 'ã' + - 'â' + - 'ï' + - 
'\u2060' + - 'ñ' + - 'ŵ' + - 'à' + - 'ù' + - 'ò' + - 'ó' + - 'ő' + - 'ê' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - 'ë' + - "è" + - "é" + - "È" + - "É" + - "%" + - "¡" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest4.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" + + - _target_: sdp.processors.RenameFields + 
output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "'", "repl": " "} + - {"pattern": "[^a-zA-ZäöüÄÖÜß.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest10.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest11.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + 
output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_de_en.yaml b/dataset_configs/commoncrawl/small_de_en.yaml new file mode 100644 index 00000000..cedb8f2e --- /dev/null +++ b/dataset_configs/commoncrawl/small_de_en.yaml @@ -0,0 +1,159 @@ +processors_to_run: "13" +workspace_dir: /mnt/ssd8/cc_sdp/de_en +NEMO_GIT_FOLDER: /home/nkarpov/workspace/NeMo + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: de + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + 
--model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_patterns: + # - '://' + - '\\x' + - 'é' + - 'ô' + - '×' + - 'š' + - '\u202a' + - 'č' + - 'ć' + - 'á' + - 'ã' + - 'â' + - 'ï' + - '\u2060' + - '°' + - 'ñ' + - 'ŵ' + - 'à' + - 'ù' + - 'ò' + - 'ó' + - 'ő' + - 'ê' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - "è" + - "é" + - "È" + - "É" + - "¡" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest8.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest9.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest10.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 
--batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest11.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-z'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.datasets.commoncrawl.BLEUScore + output_manifest_file: ${workspace_dir}/manifest13.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.commoncrawl.UseSonar + output_manifest_file: ${workspace_dir}/manifest14.json + input_text_field: text + input_audio_field: audio_filepath + output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_deu + text_encoder_model: text_sonar_basic_encoder diff --git a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml new file mode 100644 index 00000000..910bc480 --- /dev/null +++ b/dataset_configs/commoncrawl/small_en.yaml @@ -0,0 +1,202 @@ +processors_to_run: "0:" +workspace_dir: 
/mnt/ssd8/cc_sdp/en + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: en + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # - '://' + # - "(\\s)+(www)\\.[a-zA-Z0-9/]+(\\s|$)+" + - '\\x' + - "www\\.wiki" + - "www\\.usgs\\." + - 'é' + - 'ô' + - '×' + - 'š' + - 'ö' + - 'ß' + - 'ä' + - 'ü' + - '\u202a' + - 'č' + - 'ć' + - 'á' + - 'ã' + - 'â' + - 'ï' + - '\u2060' + - 'ñ' + - 'ŵ' + - 'à' + - 'ù' + - 'ò' + - 'ó' + - 'ő' + - 'ê' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - 'ë' + - "è" + - "é" + - "È" + - "É" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest4.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '¡', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - 
{"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": "%", "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest7.json + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest8.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" +# --overwrite_cache + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess + # input_manifest_file: ${workspace_dir}/manifest6.json + # output_manifest_file: ${workspace_dir}/manifest7.json + # input_manifest_arg: "--input_file" + # output_manifest_arg: "--output_file" + # arg_separator: "=" + # cmd: ""python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + + - _target_: sdp.processors.RenameFields + output_manifest_file: 
${workspace_dir}/manifest9.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-z'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest11.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest15.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: 
pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest20.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_es.yaml b/dataset_configs/commoncrawl/small_es.yaml new file mode 100644 index 00000000..458819f3 --- /dev/null +++ b/dataset_configs/commoncrawl/small_es.yaml @@ -0,0 +1,187 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/cc_sdp/es + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: es + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: es + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # ó Ó á é É í Í ¿ ñ Ñ ú Ú + # - '://' + - '\\x' + - 'ô' + - '×' + - '½' + - 'š' + - 'ö' + - 'ß' + - 'ä' + - 'ü' + - '\u202a' + - 'č' + - 'ć' + - 'ã' + - 'â' + - 'ï' + - '\u2060' + - 'ŵ' + - 'ő' + - 'ê' + - 'ă' + - 'µ' + - '³' + - 'ë' + - "%" + + - _target_: sdp.processors.DuplicateFields + duplicate_fields: {"text":"orig_text"} + + - _target_: 
sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "í"} + - {"pattern": 'è', "repl": "é"} + - {"pattern": 'È', "repl": "É"} + - {"pattern": 'ù', "repl": "ú"} + - {"pattern": 'ò', "repl": "ó"} + - {"pattern": 'à', "repl": "á"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '•', "repl": " "} + - {"pattern": '●', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--input_file" + output_manifest_arg: "--output_file" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" + + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess + # input_manifest_file: ${workspace_dir}/manifest6.json + # 
output_manifest_file: ${workspace_dir}/manifest7.json + # input_manifest_arg: "--manifest" + # output_manifest_arg: "--output_filename" + # arg_separator: "=" + # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text" + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚ'.,?¿]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest10.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest11.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_params_list: + - {"pattern": 
"[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": "¿", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": "¿", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 diff --git a/dataset_configs/commoncrawl/small_fr.yaml b/dataset_configs/commoncrawl/small_fr.yaml new file mode 100644 index 00000000..62165784 --- /dev/null +++ b/dataset_configs/commoncrawl/small_fr.yaml @@ -0,0 +1,169 @@ +processors_to_run: "3:" +workspace_dir: /mnt/ssd8/cc_sdp/fr + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest7.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + preserve_value: fr + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + preserve_value: fr + + - _target_: sdp.processors.ASRInference + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: 
${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_fr_conformer_transducer_large # nvidia/stt_fr_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.DropIfRegexMatch + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # â à ê è È é É ë î ì ï ô û ù ü ÿ ç œ æ + # - '://' + - '\\x' + - '×' + - '½' + - 'š' + - '⁶' + - 'ö' + - 'ß' + - 'ä' + - 'ü' + - '\u202a' + - 'č' + - 'ć' + - 'á' + - 'ã' + - 'ï' + - '²' + - '\u2060' + - '°' + - 'ñ' + - 'ŵ' + - 'ù' + - 'ò' + - 'ó' + - 'ő' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - 'ë' + - "%" + + + - _target_: sdp.processors.SubRegex + # input_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest4.json + max_workers: 20 + regex_params_list: + - {"pattern": '¡', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + # input_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 1000 + low_wordrate_threshold: 0.001 + + + - _target_: sdp.processors.SubRegex + # input_manifest_file: ${workspace_dir}/manifest5.json 
+ output_manifest_file: ${workspace_dir}/manifest6.json + regex_params_list: + # - {"pattern": "'", "repl": " "} + # - {"pattern": '\-', "repl": " "} + # - {"pattern": '[\[\]\":\(\);\\\+\*]', "repl": ' '} + - {"pattern": '=', "repl": " "} + - {"pattern": '$', "repl": " "} + - {"pattern": '#', "repl": " "} + - {"pattern": '/', "repl": " "} + - {"pattern": '>', "repl": " "} + - {"pattern": '<', "repl": " "} + - {"pattern": '&', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": ' ', "repl": " "} + + + - _target_: sdp.processors.DropHighLowWordrate + # input_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest7.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + # input_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: ${workspace_dir}/manifest8.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest11.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + input_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: 
${workspace_dir}/manifest14.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + input_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_pl.yaml b/dataset_configs/commoncrawl/small_pl.yaml new file mode 100644 index 00000000..97808125 --- /dev/null +++ b/dataset_configs/commoncrawl/small_pl.yaml @@ -0,0 +1,175 @@ +processors_to_run: "3:" +workspace_dir: /mnt/ssd8/cc_sdp/pl + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest7.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + preserve_value: pl + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + preserve_value: pl + + - _target_: sdp.processors.ASRInference + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_pl_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # ę ą ł Ł ć Ć ż Ż ś Ś ń ó Ó ź Ź + # - '://' + # - '\\x' + - 'é' + - 'ô' + - '×' + - '½' + - 'š' + - '⁶' + - 'ö' + - 'ß' + - 'ä' + - 'ü' + - '\u202a' + - 'č' + - 'á' + - 'ã' + - 'â' + - 'ï' + - '\u2060' + - 'ñ' + - 'ŵ' + - 'à' + - 'ù' + - 'ò' + - 'ő' + - 'ê' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - 'ë' + - "è" + - "é" + - "È" + - "É" + - "\\d" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest4.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + # 
input_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '¡', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + # input_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 1000 + low_wordrate_threshold: 0.001 + + + - _target_: sdp.processors.SubRegex + # input_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest7.json + max_workers: 20 + regex_params_list: + - {"pattern": "'", "repl": " "} + - {"pattern": '[\[\]\":\(\);\\\-\+\*]', "repl": ' '} + - {"pattern": '=', "repl": " "} + - {"pattern": '$', "repl": " "} + - {"pattern": '#', "repl": " "} + - {"pattern": '/', "repl": " "} + - {"pattern": '>', "repl": " "} + - {"pattern": '<', "repl": " "} + - {"pattern": '&', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": 'ç', "repl": "c"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": ' ', "repl": " "} + + + - _target_: sdp.processors.DropHighLowWordrate + # input_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: 
${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + # input_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest9.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + input_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + input_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml new file mode 100644 index 00000000..7c297462 --- /dev/null +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -0,0 +1,72 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/cc_sdp + +processors: + - _target_: 
sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_manifest_file: ${workspace_dir}/manifest0.json + resampled_audio_dir: ${workspace_dir}/audio/ + target_samplerate: 16000 + target_nchannels: 1 + audio_field: "audios" + video_field: "videos" + key_field: "key" + text_field: "texts" + + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt + input_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest1.json + vtt_files_dir: ${workspace_dir}/vtts/ + key_field: "key" + text_field: "texts" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.datasets.commoncrawl.AllVttText + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest2.json + input_filepath_field: vtt_filepath + output_text_field: vtt_text + + - _target_: sdp.processors.datasets.commoncrawl.TextLid + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + input_text_field: vtt_text + output_lang_field: text_lang + device: cuda + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso + output_manifest_file: ${workspace_dir}/manifest4.json + input_lang_field: text_lang + output_lang_field: text_lang + + - _target_: sdp.processors.datasets.commoncrawl.AudioLid + output_manifest_file: ${workspace_dir}/manifest5.json + input_audio_field: audios + output_lang_field: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_manifest_file: ${workspace_dir}/manifest6ps.json + output_text_field: url + key_field: key + + - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence + 
output_manifest_file: ${workspace_dir}/manifest7ps.json + splited_audio_dir: ${workspace_dir}/splited_s/ + source_audio_field: audios + audio_lang_field: audio_lang + text_lang_field: text_lang + key_field: "key" + target_audio_field: "audio_filepath" + duration_field: "duration" + text_field: "text" + vtt_field: "vtt_filepath" + # audio duration splited 532.25 + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest8ps.json + high_duration_threshold: 40 + low_duration_threshold: 0.02 diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py new file mode 100644 index 00000000..e1c87620 --- /dev/null +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, ReadParquet, CreateInitialManifestCC
diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py
new file mode 100644
index 00000000..c63309bd
--- /dev/null
+++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py
@@ -0,0 +1,645 @@
+import os
+import json
+import subprocess
+from tqdm import tqdm
+import pandas as pd
+from typing import Dict, List, Union
+from pathlib import Path
+from operator import lt, le, eq, ne, ge, gt
+import soundfile as sf
+from sacrebleu import BLEU
+
+from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry
+from sdp.logging import logger
+# These helpers live in harv_utils.py and are NOT re-exported by the package
+# __init__ (it only imports the processor classes from this very module), so
+# importing them from the package would be circular and unresolvable.
+# Import the submodule directly.  split_by_vtt is also needed by the
+# SplitByVtt processor below.
+from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt, split_by_vtt_new
+
+class UseSonar(BaseProcessor):
+    """Score audio/text pairs by distance between their SONAR embeddings.
+
+    Args:
+        input_text_field (str): manifest key holding the source text.
+        input_audio_field (str): manifest key holding the audio file path.
+        output_field (str): manifest key to store the pairwise L2 distance in.
+        speech_encoder_model (str): SONAR speech encoder checkpoint name.
+        text_encoder_model (str): SONAR text encoder/tokenizer checkpoint name.
+        device (str): torch device for both encoders. Defaults to "cuda".
+ """ + def __init__( + self, + input_text_field: str, + input_audio_field: str, + output_field: str, + speech_encoder_model: str, + text_encoder_model: str, + device: str = "cuda", + **kwargs, + ): + super().__init__(**kwargs) + import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo + from torch.nn import PairwiseDistance + + from sonar.models.sonar_speech.loader import load_sonar_speech_model + from sonar.models.sonar_text import ( + load_sonar_text_decoder_model, + load_sonar_text_encoder_model, + load_sonar_tokenizer, + ) + self.output_field = output_field + self.input_text_field = input_text_field + self.input_audio_field = input_audio_field + self.device = device + self.text_encoder_model = load_sonar_text_encoder_model(text_encoder_model, device=self.device).eval() + self.text_tokenizer = load_sonar_tokenizer(text_encoder_model) + self.speech_encoder_model = load_sonar_speech_model(speech_encoder_model, device=self.device).eval() + self.pdist = PairwiseDistance(p=2) + + def process(self): + from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline + from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline + s2vec_model = SpeechToEmbeddingModelPipeline(encoder=self.speech_encoder_model) + text_embedding_pipeline = TextToEmbeddingModelPipeline(self.text_encoder_model, self.text_tokenizer) + + manifest, dir_list = load_manifest(Path(self.input_manifest_file), keys = [self.input_audio_field, self.input_text_field]) + + text_emb = text_embedding_pipeline.predict(input = dir_list[self.input_text_field], + batch_size = 64, + source_lang="eng_Latn") + print("text_emb", type(text_emb), text_emb) + + audio_emb = s2vec_model.predict(input = dir_list[self.input_audio_field], + batch_size = 64, + n_parallel = 20, + pad_idx = 0, + n_prefetched_batches = 2,) + print("audio_emb", type(audio_emb), audio_emb) + + pdist = self.pdist(text_emb, audio_emb).numpy().astype(float) + print("pdist", 
pdist) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + assert(len(manifest)==len(pdist)) + with Path(self.output_manifest_file).open('w') as f: + for item, dist in tqdm(zip(manifest,pdist)): + item[self.output_field] = dist + f.write(json.dumps(item, ensure_ascii=False) + '\n') + +class BLEUScore(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + """ + def __init__( + self, + ref_field: str, + hyp_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.ref_field = ref_field + self.hyp_field = hyp_field + self.output_field = output_field + self.scorer = BLEU(effective_order=True) + + def process_dataset_entry(self, data_entry): + ref = data_entry[self.ref_field] + hyp = data_entry[self.hyp_field] + + res = self.scorer.sentence_score(hypothesis=hyp, + references=[ref]) + data_entry[self.output_field] = res.score + return [DataEntry(data=data_entry)] + +class Subprocess(BaseProcessor): + """This processor performs ASR inference on each utterance of the input manifest. + + ASR predictions will be saved in the ``pred_text`` key. + + Args: + pretrained_model (str): the name of the pretrained NeMo ASR model + which will be used to do inference. + batch_size (int): the batch size to use for ASR inference. Defaults to 32. + + Returns: + The same data as in the input manifest with an additional field + ``pred_text`` containing ASR model's predictions. 
+ """ + + def __init__( + self, + cmd: str, + input_manifest_arg: str = "", + output_manifest_arg: str = "", + arg_separator: str = "=", + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_arg = input_manifest_arg + self.output_manifest_arg = output_manifest_arg + self.arg_separator = arg_separator + self.cmd = cmd + + def process(self): + """This will add "pred_text" key into the output manifest.""" + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: + logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") + raise ValueError + process_args = [x for x in self.cmd.split(" ") if x] + if self.arg_separator == " ": + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg, self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg, self.output_manifest_file]) + else: + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) + + subprocess.run(process_args) + +class NmtSubprocess(Subprocess): + """This processor performs ASR inference on each utterance of the input manifest. + + ASR predictions will be saved in the ``pred_text`` key. + + Args: + pretrained_model (str): the name of the pretrained NeMo ASR model + which will be used to do inference. + batch_size (int): the batch size to use for ASR inference. Defaults to 32. + + Returns: + The same data as in the input manifest with an additional field + ``pred_text`` containing ASR model's predictions. 
+ """ + + def __init__( + self, + input_field: str, + output_field: str, + srctext_file: str, + tgtout_file: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.srctext_file = srctext_file + self.tgtout_file = tgtout_file + self.cmd = self.cmd + " --srctext" + self.arg_separator + self.srctext_file + " --tgtout" + self.arg_separator + self.tgtout_file + + def process(self): + df1 = read_jsonl(self.input_manifest_file) + with Path(self.srctext_file).open('w') as f: + for input_field in df1[self.input_field]: + f.write(input_field + "\n") + + super().process() + + with Path(self.tgtout_file).open('r') as f: + tgtout = [l.strip() for l in f] + df1[self.output_field] = tgtout + write_jsonl(df1, self.output_manifest_file) + +class PreserveByValue(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + """ + def __init__( + self, + input_field: str, + target_value: Union[int, str], + operator: str = "eq", + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.target_value = target_value + if operator == "lt": + self.operator = lt + elif operator == "le": + self.operator = le + elif operator == "eq": + self.operator = eq + elif operator == "ne": + self.operator = ne + elif operator == "ge": + self.operator = ge + elif operator == "gt": + self.operator = gt + + def process_dataset_entry(self, data_entry): + input_value = data_entry[self.input_field] + target = self.target_value + if self.operator(input_value, target): + return [DataEntry(data=data_entry)] + else: + return [DataEntry(data=None)] + +class Lang2Iso(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_lang_field: str, + output_lang_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_lang_field = input_lang_field + self.output_lang_field = output_lang_field + self.iso_m = {'English':'en', 'Spanish':'es', 'Basque':'eu', 'Dutch':'nl', 'Welsh':'cy', 'Italian':'it', + 'Catalan':'ca', 'Maltese':'mt', 'Swedish':'sv', 'French':'fr', 'German':'de', 'Chuvash':'cv', + 'Kinyarwanda':'rw', 'Polish':'pl', 'Kabyle':'kab', 'Interlingua': 'ua', 'Portuguese': 'pt', 'Hakha_Chin': 'cnh', 'Romansh_Sursilvan':'roh', 'Breton':'br', 'Esperanto':'epo', 'Czech':'ces', 'Latvian':'lav', + 'Indonesian':'ind', 'Slovenian':'slv', 'Turkish':'tur', 'Frisian':'frr', 'Tatar':'tat', 'Persian':'fas', 'Estonian':'est', 'Romanian':'rum', 'Chinese_Hongkong':'zh', 'Chinese_Taiwan':'zh', + 'Georgian':'kat', 'Kyrgyz':'kir', 'Dhivehi':'div', 'Sakha':'sah'} + + def process_dataset_entry(self, data_entry): + data_entry[self.output_lang_field] = self.iso_m[data_entry[self.input_lang_field]] + return [DataEntry(data=data_entry)] + +class SplitByVttSentence(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + splited_audio_dir: str, + source_audio_field: str, + text_lang_field: str, + audio_lang_field: str, + key_field: str, + target_audio_field: str, + duration_field: str, + text_field: str, + vtt_field: str, + duration_threshold: float = 10.0, + **kwargs, + ): + super().__init__(**kwargs) + self.splited_audio_dir = splited_audio_dir + self.source_audio_field = source_audio_field + self.text_lang_field = text_lang_field + self.audio_lang_field = audio_lang_field + self.key_field = key_field + self.target_audio_field = target_audio_field + self.duration_field = duration_field + self.text_field = text_field + self.vtt_field = vtt_field + self.duration_threshold = duration_threshold + + def prepare(self): + os.makedirs(self.splited_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + key = data_entry[self.key_field] + vtt_file = data_entry[self.vtt_field] + source_audio = data_entry[self.source_audio_field] + res_list = [] + + if os.path.isfile(source_audio): + data, samplerate = sf.read(source_audio) + text_list, start_s, end_s = split_by_vtt_new(vtt_file, samplerate) + text_c = '' + start_c, end_c = 0, 0 + if text_list: + for text, start_sr, end_sr in zip(text_list, start_s, end_s): + text_c += " " + text + if start_c==0: + start_c = start_sr + else: + pass + end_c = end_sr + if len(text_c)>0 and (end_c - start_c > self.duration_threshold * 16000 or text_c[-1] == "." 
or text_c[-1] == "?"): + res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) + text_c = '' + start_c, end_c = 0, 0 + else: + pass + if len(text_c)>0 and start_c!=0: + res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) + + return res_list + + def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c): + data_sample = data[start_c:end_c] + wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/16))+"-"+str(int(end_c/16))+".wav") + os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) + sf.write(wav_save_file, data_sample, samplerate) + return DataEntry(data = {self.target_audio_field: wav_save_file, + self.duration_field: data_sample.shape[0]/samplerate, + self.text_field: text_c, + self.audio_lang_field: data_entry[self.audio_lang_field], + self.text_lang_field: data_entry[self.text_lang_field], + self.key_field: key}) + +class SplitByVtt(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + splited_audio_dir: str, + source_audio_field: str, + text_lang_field: str, + audio_lang_field: str, + key_field: str, + target_audio_field: str, + duration_field: str, + text_field: str, + vtt_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.splited_audio_dir = splited_audio_dir + self.source_audio_field = source_audio_field + self.text_lang_field = text_lang_field + self.audio_lang_field = audio_lang_field + self.key_field = key_field + self.target_audio_field = target_audio_field + self.duration_field = duration_field + self.text_field = text_field + self.vtt_field = vtt_field + + def prepare(self): + os.makedirs(self.splited_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + key = data_entry[self.key_field] + vtt_file = data_entry[self.vtt_field] + source_audio = data_entry[self.source_audio_field] + res_list = [] + + if os.path.isfile(source_audio): + wav_list, text_list, dur_list = split_by_vtt(vtt_file, source_audio, self.splited_audio_dir) + if wav_list: + for wav, text, dur in zip(wav_list, text_list, dur_list): + res_list.append(DataEntry(data = {self.target_audio_field: wav, + self.duration_field: dur, + self.text_field: text, + self.audio_lang_field: data_entry[self.audio_lang_field], + self.text_lang_field: data_entry[self.text_lang_field], + self.key_field: key})) + return res_list + +class AudioLid(BaseProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_audio_field: str, + pretrained_model: str, + output_lang_field: str, + device: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_audio_field = input_audio_field + self.pretrained_model = pretrained_model + self.output_lang_field = output_lang_field + self.device = device + + def process(self): + import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo + import nemo.collections.asr as nemo_asr + + model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name=self.pretrained_model) + + if self.device is None: + if torch.cuda.is_available(): + model = model.cuda() + else: + model = model.cpu() + else: + model = model.to(self.device) + + manifest = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(manifest): + audio_file = item[self.input_audio_field] + + try: + lang = model.get_label(audio_file, 60*5) + except Exception as e: + logger.warning("AudioLid " + audio_file+ " " + str(e)) + lang = None + + if lang: + item[self.output_lang_field] = lang + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + +class TextLid(BaseProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_text_field: str, + pretrained_model: str, + output_lang_field: str, + device: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_text_field = input_text_field + self.pretrained_model = pretrained_model + self.output_lang_field = output_lang_field + self.device = device + + def process(self): + import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo + from transformers import AutoTokenizer, AutoModelForSequenceClassification + + tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model) + text_model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_model) + + if self.device is None: + if torch.cuda.is_available(): + text_model = text_model.cuda() + else: + text_model = text_model.cpu() + else: + text_model = text_model.to(self.device) + + manifest = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(manifest): + text = item[self.input_text_field] + if text: + lid = text2lid(text_model, tokenizer, text) + else: + lid = None + + if lid: + item[self.output_lang_field] = lid + f.write(json.dumps(item, ensure_ascii=False) + '\n') + +class AllVttText(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + output_text_field: str, + input_filepath_field: str = "vtt_filepath", + **kwargs, + ): + super().__init__(**kwargs) + self.output_text_field = output_text_field + self.input_filepath_field = input_filepath_field + + def process_dataset_entry(self, data_entry): + vtt_file = data_entry[self.input_filepath_field] + res_list = [DataEntry(data=None)] + if os.path.isfile(vtt_file): + try: + data_entry[self.output_text_field] = get_vtt_text(vtt_file) + res_list = [DataEntry(data=data_entry)] + except Exception as e: + logger.warning("AllVttText " + vtt_file + " " + str(e)) + return res_list + + +class TxtToVtt(BaseParallelProcessor): + """ + Args: + raw_data_dir (str): where to put raw downloaded data. + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. + """ + def __init__( + self, + vtt_files_dir: str, + key_field: str, + text_field: str, + vtt_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.vtt_files_dir = vtt_files_dir + self.key_field = key_field + self.text_field = text_field + self.vtt_field = vtt_field + + self.trans_list = make_trans_list() + + def prepare(self): + os.makedirs(self.vtt_files_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + key = data_entry[self.key_field] + text_file = data_entry[self.text_field] + os.makedirs(os.path.join(self.vtt_files_dir, key.split("/")[0]), exist_ok=True) + + vtt_file = os.path.join(self.vtt_files_dir, key) + ".vtt" + + txt2vtt(text_file, vtt_file, self.trans_list) + + data_entry[self.vtt_field] = vtt_file + + return [DataEntry(data=data_entry)] + +class ReadParquet(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + output_text_field: str, + key_field: str, + raw_data_dir: str, + **kwargs, + ): + super().__init__(**kwargs) + self.output_text_field = output_text_field + self.key_field = key_field + self.raw_data_dir = Path(raw_data_dir) + + def prepare(self): + parquets = [str(self.raw_data_dir / p) for p in self.raw_data_dir.rglob('*.parquet')] + self.urls = None + for parquet in parquets: + df1 = pd.read_parquet(parquet).sort_values("key").set_index("key") + if self.urls is None: + self.urls = df1 + else: + self.urls = pd.concat([self.urls, df1]) + + def process_dataset_entry(self, data_entry): + key = data_entry[self.key_field] + key = key.split("/")[1] + try: + data_entry[self.output_text_field] = self.urls.loc[key]['url'] + except: + data_entry[self.output_text_field] = "NN" + logger.warning("Key: " + key) + return [DataEntry(data=data_entry)] + +class CreateInitialManifestCC(BaseParallelProcessor): + """ + Args: + raw_data_dir (str): where to put raw downloaded data. + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. 
+ """ + def __init__( + self, + raw_data_dir: str, + resampled_audio_dir: str, + audio_field: str, + video_field: str, + key_field: str, + text_field: str, + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.raw_data_dir = Path(raw_data_dir) + self.audio_field = audio_field + self.video_field = video_field + self.key_field = key_field + self.text_field = text_field + self.resampled_audio_dir = resampled_audio_dir + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def prepare(self): + os.makedirs(self.raw_data_dir, exist_ok=True) + os.makedirs(self.resampled_audio_dir, exist_ok=True) + + def read_manifest(self): + videos = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.jpg')] + texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] + v_df = pd.DataFrame({self.video_field: videos}) + t_df = pd.DataFrame({self.text_field: texts }) + v_df[self.key_field] = v_df[self.video_field].apply(lambda x: os.path.splitext(x)[0][-13:]) + t_df[self.key_field] = t_df[self.text_field].apply(lambda x: os.path.splitext(x)[0][-13:]) + v_df = v_df.drop_duplicates(self.key_field) + t_df = t_df.drop_duplicates(self.key_field) + vt_df = v_df.merge(t_df, on=self.key_field, how="left") + return vt_df.values + + def process_dataset_entry(self, data_entry): + (video, key, text) = data_entry + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + audio = os.path.join(self.resampled_audio_dir, key) + ".wav" + ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + + data = {self.audio_field: audio, + self.key_field: key, + self.text_field: text} + return [DataEntry(data=data)] diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py new file mode 100644 index 00000000..ebc6f5b1 --- /dev/null +++ 
import json
import os
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd

# NOTE: heavy / optional dependencies (torch, webvtt-py, soundfile, sdp) are
# imported lazily inside the functions that need them, so this module can be
# imported — and its pure helpers used — without them installed.


def _log_warning(msg: str) -> None:
    """Log through sdp's logger when available, stdlib logging otherwise."""
    try:
        from sdp.logging import logger
    except ImportError:  # pragma: no cover - fallback outside the sdp package
        import logging

        logger = logging.getLogger(__name__)
    logger.warning(msg)


def read_jsonl(manifest_file) -> pd.DataFrame:
    """Read a JSON-lines manifest (one JSON object per line) into a DataFrame."""
    with open(manifest_file, 'r') as fin:
        records = [json.loads(line) for line in fin]
    return pd.DataFrame.from_records(records)


def write_jsonl(df_in: pd.DataFrame, manifest_filename) -> None:
    """Write a DataFrame as JSON lines, one record per row, preserving column order."""
    with open(manifest_filename, 'w') as fout:
        for row in df_in.itertuples(index=False):
            # zip positionally so column names with odd characters still work
            fout.write(json.dumps(dict(zip(df_in.columns, row))) + '\n')


def load_manifest(manifest: Path, keys: Union[List[str], None] = None):
    """Load a JSON-lines manifest.

    Args:
        manifest: path to the manifest file.
        keys: optional list of fields to also collect column-wise.

    Returns:
        The list of record dicts; when ``keys`` is a non-empty list, a tuple
        ``(records, {key: [values...]})`` instead.  ``keys=[]`` behaves like
        ``keys=None`` (original behavior preserved).

    Note: the original signature used a mutable default argument
    (``keys=[]``); fixed here with a ``None`` sentinel.
    """
    keys = keys or []
    records = []
    columns: Dict[str, list] = {key: [] for key in keys}
    with manifest.open() as f:
        for line in f:
            data = json.loads(line)
            records.append(data)
            for key in keys:
                columns[key].append(data[key])
    if keys:
        return records, columns
    return records


def get_vtt_text(vtt_file) -> str:
    """Concatenate all caption text of a .vtt file, skipping thumbnail captions."""
    import webvtt  # pip install webvtt-py

    parts = []
    for caption in webvtt.read(vtt_file):
        if "thumbnails" not in caption.text:
            parts.append(' '.join(caption.text.split('\n')))
    return ' '.join(parts)


def text2lid(text_model, tokenizer, text):
    """Classify the language of ``text`` with a sequence-classification model.

    Truncates input to 512 characters and runs on ``cuda:0``.  Returns the
    predicted language name.
    """
    import torch

    # NOTE(review): the label order must match the classifier head exactly;
    # "Ukranian" spelling is kept as-is since it presumably mirrors the
    # model's own label set — do not "fix" it without checking the model.
    text_langs = "Arabic, Basque, Breton, Catalan, Chinese_China, Chinese_Hongkong, Chinese_Taiwan, Chuvash, Czech, Dhivehi, Dutch, English, Esperanto, Estonian, French, Frisian, Georgian, German, Greek, Hakha_Chin, Indonesian, Interlingua, Italian, Japanese, Kabyle, Kinyarwanda, Kyrgyz, Latvian, Maltese, Mongolian, Persian, Polish, Portuguese, Romanian, Romansh_Sursilvan, Russian, Sakha, Slovenian, Spanish, Swedish, Tamil, Tatar, Turkish, Ukranian, Welsh".split(", ")
    inputs = tokenizer(text[:512], return_tensors="pt").to("cuda:0")
    with torch.no_grad():
        text_logits = text_model(**inputs).logits
    lang_id = text_logits.argmax(1).cpu()[0].numpy()
    return text_langs[lang_id]


def parse_hours(inp: str) -> datetime:
    """Parse a VTT timestamp that may exceed 24 hours.

    WebVTT allows hour fields >= 24 but ``datetime`` does not, so overflow
    hours are folded into days (and days beyond 31 into months) before
    parsing.  All returned datetimes share the same epoch, so subtracting
    two of them yields a correct duration.
    """
    fields = inp.split(":")
    if len(fields) == 3 and int(fields[0]) >= 24:
        hours = int(fields[0]) % 24
        days = int(fields[0]) // 24
        rest = ":".join(fields[1:])
        if days < 31:
            return datetime.strptime(f"{1 + days}:{hours}:{rest}", '%d:%H:%M:%S.%f')
        months = days // 31
        days = days % 31
        return datetime.strptime(f"{1 + months}/{1 + days} {hours}:{rest}", '%m/%d %H:%M:%S.%f')
    return datetime.strptime(inp, '%H:%M:%S.%f')


def split_by_vtt(vtt_file, wav_file, wav_save_path):
    """Cut ``wav_file`` into one mono clip per caption of ``vtt_file``.

    Clips are written under ``wav_save_path/<last two vtt path components>/
    <start_ms>-<end_ms>.wav``.

    Returns:
        ``(wav_paths, texts, durations)`` lists, or ``(None, None, None)``
        when reading/cutting failed (the error is logged, not raised).
    """
    import soundfile as sf
    import webvtt

    try:
        data, samplerate = sf.read(wav_file)
        if len(data.shape) > 1:
            data = np.mean(data, axis=1)  # downmix multi-channel to mono
        epoch = datetime.strptime('00:00:00.000', '%H:%M:%S.%f')
        # last two path components identify the clip's output subdirectory
        rel_vtt = '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:])
        wav_list, text_list, dur_list = [], [], []
        for caption in webvtt.read(vtt_file):
            start = (parse_hours(caption.start) - epoch).total_seconds()
            end = (parse_hours(caption.end) - epoch).total_seconds()
            start_sr = int(start * samplerate)
            end_sr = int(end * samplerate)
            text = ' '.join(caption.text.split('\n'))
            wav_save_file = os.path.join(
                wav_save_path, rel_vtt, f"{int(start * 1000)}-{int(end * 1000)}.wav"
            )
            os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True)
            clip = data[start_sr:end_sr]
            sf.write(wav_save_file, clip, samplerate)
            text_list.append(text)
            wav_list.append(wav_save_file)
            dur_list.append(clip.shape[0] / samplerate)
        return wav_list, text_list, dur_list
    except Exception as e:
        _log_warning(str(e) + vtt_file)
        return None, None, None


def split_by_vtt_new(vtt_file, samplerate):
    """Return per-caption ``(texts, start_samples, end_samples)`` for a .vtt file.

    Like :func:`split_by_vtt` but only computes sample offsets; no audio is
    read or written.  Returns ``(None, None, None)`` on failure.
    """
    import webvtt

    try:
        epoch = datetime.strptime('00:00:00.000', '%H:%M:%S.%f')
        text_list, start_s, end_s = [], [], []
        for caption in webvtt.read(vtt_file):
            text = ' '.join(caption.text.split('\n'))
            start = (parse_hours(caption.start) - epoch).total_seconds()
            end = (parse_hours(caption.end) - epoch).total_seconds()
            text_list.append(text.strip())
            start_s.append(int(start * samplerate))
            end_s.append(int(end * samplerate))
        return text_list, start_s, end_s
    except Exception as e:
        _log_warning(str(e) + vtt_file)
        return None, None, None


def audio_duration(fname) -> float:
    """Duration of an audio file in seconds."""
    import soundfile as sf

    data, samplerate = sf.read(fname)
    return data.shape[0] / samplerate


def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1):
    """Extract the audio track of ``jpg`` (a video container despite the
    extension) to 16-bit PCM ``wav`` with ``ac`` channels, resampling to
    ``ar`` Hz when ``ar`` is non-zero.

    Returns the :class:`subprocess.CompletedProcess`; ffmpeg output is
    silenced.
    """
    cmd = ["ffmpeg", "-i", jpg, "-ac", str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y"]
    if ar:
        cmd.extend(["-ar", str(ar)])
    cmd.append(wav)
    return subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


def read_txt(txt_file) -> str:
    """Read a file containing the ``repr`` of a bytes object (``b'...'``).

    Strips the leading ``b'`` and trailing quote and restores newline
    escapes; other escapes (``\\xNN`` UTF-8 bytes) are left literal for
    :func:`txt2vtt` / :func:`translate` to repair.
    """
    with open(txt_file, "r") as f:
        text = f.read()
    return text[2:-1].replace("\\n", "\n").replace("\\r", "\r")


def translate(txt: str, trans_list) -> str:
    """Apply ``(old, new)`` replacement pairs to ``txt`` sequentially."""
    for old, new in trans_list:
        txt = txt.replace(old, new)
    return txt


def txt2vtt(txt_file: str, vtt_file: str, trans_list: List) -> None:
    """Convert a raw caption dump to a .vtt file.

    Ensures the ``WEBVTT`` header is present and repairs escaped UTF-8
    sequences via ``trans_list``.  Does nothing when the source is empty.
    """
    txt = read_txt(txt_file)
    if txt:
        if not txt.startswith("WEBVTT"):
            txt = "WEBVTT" + txt
        with open(vtt_file, "w") as f:
            f.write(translate(txt, trans_list))


def make_trans_list() -> List[Tuple[str, str]]:
    """Build (escape-text, character) pairs that undo str()-of-bytes mojibake.

    Caption dumps contain literal ``\\xc2\\xa0``-style substrings (the UTF-8
    bytes of each non-ASCII character, rendered by ``repr``).  Each pair maps
    such an escape text back to the character it encodes.

    This replaces the original ~640-row hand-pasted table by generating the
    same pairs, in the same order, over the same codepoint ranges:
    U+0080–U+01FF (Latin-1 supplement / Latin extended), U+2000–U+207F
    (general punctuation / superscripts) and U+2580–U+25FF (block elements /
    geometric shapes).

    NOTE(review): the original table's first ``U+0000`` row was malformed
    (empty columns) and is dropped here; the escaped single quote mapping
    from its ``U+0001`` row is kept.
    """
    trans_list: List[Tuple[str, str]] = [("\\'", "'")]
    codepoints = [
        *range(0x0080, 0x0200),
        *range(0x2000, 0x2080),
        *range(0x2580, 0x2600),
    ]
    for cp in codepoints:
        ch = chr(cp)
        # lowercase hex matches the escape style produced by repr(bytes)
        escape_text = "".join("\\x%02x" % byte for byte in ch.encode("utf-8"))
        trans_list.append((escape_text, ch))
    return trans_list
00000000..39d03091 --- /dev/null +++ b/sdp/processors/datasets/commoncrawl/requirements.txt @@ -0,0 +1,7 @@ +sacrebleu +ffmpeg-python +webvtt-py +fastparquet +pysndfile # conda install -c conda-forge libsndfile==1.0.31 +sonar-space +fairseq2 From c4ea5e467edd96f6b5a7e89f6348ef2f95bcf2a6 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 12 Sep 2023 07:36:37 -0700 Subject: [PATCH 002/115] batch Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/commoncrawl/commoncrawl.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index c63309bd..c25f1aab 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -11,7 +11,7 @@ from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger -from sdp.processors.datasets.commoncrawl import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new +from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new class UseSonar(BaseProcessor): """ @@ -25,6 +25,7 @@ def __init__( output_field: str, speech_encoder_model: str, text_encoder_model: str, + batch_size: int = 64, device: str = "cuda", **kwargs, ): @@ -41,6 +42,7 @@ def __init__( self.output_field = output_field self.input_text_field = input_text_field self.input_audio_field = input_audio_field + self.batch_size = batch_size self.device = device self.text_encoder_model = load_sonar_text_encoder_model(text_encoder_model, device=self.device).eval() self.text_tokenizer = load_sonar_tokenizer(text_encoder_model) @@ -56,19 +58,16 @@ def process(self): manifest, dir_list = load_manifest(Path(self.input_manifest_file), keys = 
[self.input_audio_field, self.input_text_field]) text_emb = text_embedding_pipeline.predict(input = dir_list[self.input_text_field], - batch_size = 64, + batch_size = self.batch_size, source_lang="eng_Latn") - print("text_emb", type(text_emb), text_emb) audio_emb = s2vec_model.predict(input = dir_list[self.input_audio_field], - batch_size = 64, + batch_size = self.batch_size, n_parallel = 20, pad_idx = 0, n_prefetched_batches = 2,) - print("audio_emb", type(audio_emb), audio_emb) pdist = self.pdist(text_emb, audio_emb).numpy().astype(float) - print("pdist", pdist) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) assert(len(manifest)==len(pdist)) From 4ebc195f42cdef5196fbd5a9d4d621b3430e4f36 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 12 Sep 2023 07:37:24 -0700 Subject: [PATCH 003/115] rm filter Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small_de.yaml | 72 ++++--------- dataset_configs/commoncrawl/small_de_en.yaml | 66 +++--------- dataset_configs/commoncrawl/small_en.yaml | 93 ++++------------- dataset_configs/commoncrawl/small_es.yaml | 91 ++++++---------- dataset_configs/commoncrawl/small_fr.yaml | 75 +++----------- dataset_configs/commoncrawl/small_pl.yaml | 103 +++++-------------- 6 files changed, 121 insertions(+), 379 deletions(-) diff --git a/dataset_configs/commoncrawl/small_de.yaml b/dataset_configs/commoncrawl/small_de.yaml index ce8b9d27..cd127fc1 100644 --- a/dataset_configs/commoncrawl/small_de.yaml +++ b/dataset_configs/commoncrawl/small_de.yaml @@ -17,52 +17,16 @@ processors: output_manifest_file: ${workspace_dir}/manifest2.json pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc batch_size: 64 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest3.json - max_workers: 20 - regex_patterns: - # - '://' - # - '\\x' - - 'é' - - 'ô' - - '×' - - 'š' - - '\u202a' - - 'č' - - 'ć' - - 'á' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - 'ñ' - - 'ŵ' - - 'à' - 
- 'ù' - - 'ò' - - 'ó' - - 'ő' - - 'ê' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - 'ë' - - "è" - - "é" - - "È" - - "É" - - "%" - - "¡" - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest4.json + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest4.json regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} - {"pattern": 'í', "repl": "i"} @@ -88,12 +52,12 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest5.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: ${workspace_dir}/manifest6.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" arg_separator: "=" @@ -102,11 +66,11 @@ processors: --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json rename_fields: {"normalized":"text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text regex_params_list: - {"pattern": '\\.{3}', "repl": '.'} @@ -116,20 +80,20 @@ processors: - {"pattern": ' ', "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: 
${workspace_dir}/manifest9.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest10.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest11.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest12.json text_key: text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} @@ -137,15 +101,15 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest13.json duplicate_fields: {"pred_text":"pred_text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest14.json text_key: pred_text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest15.json text_key: pred_text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} @@ -153,19 +117,19 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest17.json + output_manifest_file: ${workspace_dir}/manifest16.json text_key: text regex_patterns: - "^\\s*$" - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest17.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest18.json 
text_key: text pred_text_key: pred_text cer_threshold: 30 diff --git a/dataset_configs/commoncrawl/small_de_en.yaml b/dataset_configs/commoncrawl/small_de_en.yaml index cedb8f2e..55b7bb5e 100644 --- a/dataset_configs/commoncrawl/small_de_en.yaml +++ b/dataset_configs/commoncrawl/small_de_en.yaml @@ -1,6 +1,5 @@ -processors_to_run: "13" +processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp/de_en -NEMO_GIT_FOLDER: /home/nkarpov/workspace/NeMo processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue @@ -43,47 +42,10 @@ processors: cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ --model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest7.json - text_key: text - regex_patterns: - # - '://' - - '\\x' - - 'é' - - 'ô' - - '×' - - 'š' - - '\u202a' - - 'č' - - 'ć' - - 'á' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - '°' - - 'ñ' - - 'ŵ' - - 'à' - - 'ù' - - 'ò' - - 'ó' - - 'ő' - - 'ê' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - "è" - - "é" - - "È" - - "É" - - "¡" - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '+', "repl": ' '} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} - {"pattern": 'í', "repl": "i"} @@ -109,26 +71,25 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json input_manifest_arg: "--manifest" 
output_manifest_arg: "--output_filename" arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - # --overwrite_cache + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" # --overwrite_cache - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest10.json rename_fields: {"normalized":"text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest11.json text_key: text regex_params_list: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} @@ -144,16 +105,23 @@ processors: - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest12.json ref_field: text hyp_field: pred_text output_field: bleu - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest13.json input_text_field: text input_audio_field: audio_filepath output_field: sonar_dist device: cuda + batch_size: 256 speech_encoder_model: sonar_speech_encoder_deu text_encoder_model: text_sonar_basic_encoder + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest14.json + input_field: bleu + target_value: 10 + operator: ge diff --git 
a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml index 910bc480..1922dfe0 100644 --- a/dataset_configs/commoncrawl/small_en.yaml +++ b/dataset_configs/commoncrawl/small_en.yaml @@ -1,4 +1,4 @@ -processors_to_run: "0:" +processors_to_run: "3:" workspace_dir: /mnt/ssd8/cc_sdp/en processors: @@ -17,55 +17,14 @@ processors: output_manifest_file: ${workspace_dir}/manifest2.json pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc batch_size: 64 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest3.json - regex_patterns: - # - '://' - # - "(\\s)+(www)\\.[a-zA-Z0-9/]+(\\s|$)+" - - '\\x' - - "www\\.wiki" - - "www\\.usgs\\." - - 'é' - - 'ô' - - '×' - - 'š' - - 'ö' - - 'ß' - - 'ä' - - 'ü' - - '\u202a' - - 'č' - - 'ć' - - 'á' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - 'ñ' - - 'ŵ' - - 'à' - - 'ù' - - 'ò' - - 'ó' - - 'ő' - - 'ê' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - 'ë' - - "è" - - "é" - - "È" - - "É" - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest4.json + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest4.json regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'î', "repl": "i"} @@ -97,40 +56,26 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest5.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest7.json - regex_patterns: - - "^\\s*$" - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: 
${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest6.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" -# --overwrite_cache - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # input_manifest_file: ${workspace_dir}/manifest6.json - # output_manifest_file: ${workspace_dir}/manifest7.json - # input_manifest_arg: "--input_file" - # output_manifest_arg: "--output_file" - # arg_separator: "=" - # cmd: ""python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - + - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest7.json rename_fields: {"normalized":"text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text regex_params_list: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} @@ -146,20 +91,20 @@ processors: - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest9.json high_wordrate_threshold: 100 
low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest10.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest11.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest12.json text_key: text regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} @@ -167,15 +112,15 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest13.json duplicate_fields: {"pred_text":"pred_text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest14.json text_key: pred_text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest17.json + output_manifest_file: ${workspace_dir}/manifest15.json text_key: pred_text regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} @@ -183,19 +128,19 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest16.json text_key: text regex_patterns: - "^\\s*$" - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest17.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest20.json + output_manifest_file: ${workspace_dir}/manifest18.json text_key: text pred_text_key: pred_text cer_threshold: 30 diff --git 
a/dataset_configs/commoncrawl/small_es.yaml b/dataset_configs/commoncrawl/small_es.yaml index 458819f3..03b11418 100644 --- a/dataset_configs/commoncrawl/small_es.yaml +++ b/dataset_configs/commoncrawl/small_es.yaml @@ -1,4 +1,4 @@ -processors_to_run: "0:" +processors_to_run: "3:" workspace_dir: /mnt/ssd8/cc_sdp/es processors: @@ -18,42 +18,15 @@ processors: pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc batch_size: 64 - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest3.json - regex_patterns: - # ó Ó á é É í Í ¿ ñ Ñ ú Ú - # - '://' - - '\\x' - - 'ô' - - '×' - - '½' - - 'š' - - 'ö' - - 'ß' - - 'ä' - - 'ü' - - '\u202a' - - 'č' - - 'ć' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - 'ŵ' - - 'ő' - - 'ê' - - 'ă' - - 'µ' - - '³' - - 'ë' - - "%" - - _target_: sdp.processors.DuplicateFields + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest4.json regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "í"} - {"pattern": 'è', "repl": "é"} @@ -86,34 +59,34 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest5.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess + # output_manifest_file: ${workspace_dir}/manifest6.json + # input_manifest_arg: "--input_file" + # output_manifest_arg: "--output_file" + # arg_separator: "=" + # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text 
--cache_dir=${workspace_dir}/cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" + - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest7.json - input_manifest_arg: "--input_file" - output_manifest_arg: "--output_file" + output_manifest_file: ${workspace_dir}/manifest6.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # input_manifest_file: ${workspace_dir}/manifest6.json - # output_manifest_file: ${workspace_dir}/manifest7.json - # input_manifest_arg: "--manifest" - # output_manifest_arg: "--output_filename" - # arg_separator: "=" - # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text" - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json rename_fields: {"normalized":"text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text regex_params_list: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": 
"\\1"} @@ -130,20 +103,20 @@ processors: - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest10.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest11.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest12.json text_key: text regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} @@ -152,15 +125,15 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest13.json duplicate_fields: {"pred_text":"pred_text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest14.json text_key: pred_text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest15.json text_key: pred_text regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} @@ -169,19 +142,19 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest17.json + output_manifest_file: ${workspace_dir}/manifest16.json text_key: text regex_patterns: - "^\\s*$" - _target_: sdp.processors.DropHighWER - output_manifest_file: 
${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest17.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest18.json text_key: text pred_text_key: pred_text cer_threshold: 30 diff --git a/dataset_configs/commoncrawl/small_fr.yaml b/dataset_configs/commoncrawl/small_fr.yaml index 62165784..0406c2fa 100644 --- a/dataset_configs/commoncrawl/small_fr.yaml +++ b/dataset_configs/commoncrawl/small_fr.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/ssd8/cc_sdp/fr processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest7.json + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang preserve_value: fr @@ -14,58 +14,19 @@ processors: preserve_value: fr - _target_: sdp.processors.ASRInference - input_manifest_file: ${workspace_dir}/manifest1.json output_manifest_file: ${workspace_dir}/manifest2.json pretrained_model: nvidia/stt_fr_conformer_transducer_large # nvidia/stt_fr_fastconformer_hybrid_large_pc batch_size: 64 - _target_: sdp.processors.DuplicateFields - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.DropIfRegexMatch - input_manifest_file: ${workspace_dir}/manifest2.json output_manifest_file: ${workspace_dir}/manifest3.json - regex_patterns: - # â à ê è È é É ë î ì ï ô û ù ü ÿ ç œ æ - # - '://' - - '\\x' - - '×' - - '½' - - 'š' - - '⁶' - - 'ö' - - 'ß' - - 'ä' - - 'ü' - - '\u202a' - - 'č' - - 'ć' - - 'á' - - 'ã' - - 'ï' - - '²' - - '\u2060' - - '°' - - 'ñ' - - 'ŵ' - - 'ù' - - 'ò' - - 'ó' - - 'ő' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - 'ë' - - "%" - + duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - # input_manifest_file: ${workspace_dir}/manifest3.json output_manifest_file: ${workspace_dir}/manifest4.json - 
max_workers: 20 regex_params_list: - - {"pattern": '¡', "repl": "i"} + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '¡', "repl": " "} - {"pattern": '‚', "repl": ","} - {"pattern": "’", "repl": "'"} - {"pattern": "[-–—]", "repl": " "} @@ -89,39 +50,29 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - # input_manifest_file: ${workspace_dir}/manifest4.json output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 1000 - low_wordrate_threshold: 0.001 + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 - _target_: sdp.processors.SubRegex - # input_manifest_file: ${workspace_dir}/manifest5.json output_manifest_file: ${workspace_dir}/manifest6.json + text_key: normalized regex_params_list: - # - {"pattern": "'", "repl": " "} - # - {"pattern": '\-', "repl": " "} - # - {"pattern": '[\[\]\":\(\);\\\+\*]', "repl": ' '} - - {"pattern": '=', "repl": " "} - - {"pattern": '$', "repl": " "} - - {"pattern": '#', "repl": " "} - - {"pattern": '/', "repl": " "} - - {"pattern": '>', "repl": " "} - - {"pattern": '<', "repl": " "} - - {"pattern": '&', "repl": " "} - - {"pattern": '@', "repl": " "} + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.DropHighLowWordrate - # input_manifest_file: ${workspace_dir}/manifest6.json output_manifest_file: ${workspace_dir}/manifest7.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - # input_manifest_file: ${workspace_dir}/manifest7.json output_manifest_file: ${workspace_dir}/manifest8.json duplicate_fields: {"text":"text_pc"} @@ -154,14 +105,12 @@ processors: - {"pattern": " ", "repl": " "} - _target_: 
sdp.processors.DropHighWER - input_manifest_file: ${workspace_dir}/manifest13.json output_manifest_file: ${workspace_dir}/manifest14.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - input_manifest_file: ${workspace_dir}/manifest14.json output_manifest_file: ${workspace_dir}/manifest15.json text_key: text pred_text_key: pred_text diff --git a/dataset_configs/commoncrawl/small_pl.yaml b/dataset_configs/commoncrawl/small_pl.yaml index 97808125..c2648ebb 100644 --- a/dataset_configs/commoncrawl/small_pl.yaml +++ b/dataset_configs/commoncrawl/small_pl.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/ssd8/cc_sdp/pl processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest7.json + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang preserve_value: pl @@ -14,61 +14,18 @@ processors: preserve_value: pl - _target_: sdp.processors.ASRInference - input_manifest_file: ${workspace_dir}/manifest1.json output_manifest_file: ${workspace_dir}/manifest2.json pretrained_model: nvidia/stt_pl_fastconformer_hybrid_large_pc batch_size: 64 - - - _target_: sdp.processors.DropIfRegexMatch - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - regex_patterns: - # ę ą ł Ł ć Ć ż Ż ś Ś ń ó Ó ź Ź - # - '://' - # - '\\x' - - 'é' - - 'ô' - - '×' - - '½' - - 'š' - - '⁶' - - 'ö' - - 'ß' - - 'ä' - - 'ü' - - '\u202a' - - 'č' - - 'á' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - 'ñ' - - 'ŵ' - - 'à' - - 'ù' - - 'ò' - - 'ő' - - 'ê' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - 'ë' - - "è" - - "é" - - "È" - - "É" - - "\\d" - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest3.json duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - # input_manifest_file: 
${workspace_dir}/manifest4.json - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest4.json regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} - {"pattern": 'í', "repl": "i"} @@ -95,49 +52,37 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - # input_manifest_file: ${workspace_dir}/manifest5.json - output_manifest_file: ${workspace_dir}/manifest6.json - high_wordrate_threshold: 1000 - low_wordrate_threshold: 0.001 - + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 - _target_: sdp.processors.SubRegex - # input_manifest_file: ${workspace_dir}/manifest6.json - output_manifest_file: ${workspace_dir}/manifest7.json - max_workers: 20 + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text regex_params_list: - - {"pattern": "'", "repl": " "} - - {"pattern": '[\[\]\":\(\);\\\-\+\*]', "repl": ' '} - - {"pattern": '=', "repl": " "} - - {"pattern": '$', "repl": " "} - - {"pattern": '#', "repl": " "} - - {"pattern": '/', "repl": " "} - - {"pattern": '>', "repl": " "} - - {"pattern": '<', "repl": " "} - - {"pattern": '&', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": 'ç', "repl": "c"} + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.DropHighLowWordrate - # input_manifest_file: ${workspace_dir}/manifest7.json - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - # 
input_manifest_file: ${workspace_dir}/manifest8.json - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest10.json text_key: text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} @@ -145,15 +90,15 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest11.json duplicate_fields: {"pred_text":"pred_text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest12.json text_key: pred_text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest13.json text_key: pred_text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} @@ -161,15 +106,13 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighWER - input_manifest_file: ${workspace_dir}/manifest14.json - output_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest14.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - input_manifest_file: ${workspace_dir}/manifest15.json - output_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest15.json text_key: text pred_text_key: pred_text cer_threshold: 30 \ No newline at end of file From d9b3473e1bcc2241b770557ab5510dad08985d9d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov 
Date: Mon, 25 Sep 2023 01:54:11 -0700 Subject: [PATCH 004/115] add caption Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small_de_en.yaml | 5 +- dataset_configs/commoncrawl/small_fr.yaml | 18 +-- dataset_configs/commoncrawl/small_pl.yaml | 17 +-- .../commoncrawl/small_sentence.yaml | 15 ++- .../datasets/commoncrawl/commoncrawl.py | 107 ++++++++++++------ 5 files changed, 103 insertions(+), 59 deletions(-) diff --git a/dataset_configs/commoncrawl/small_de_en.yaml b/dataset_configs/commoncrawl/small_de_en.yaml index 55b7bb5e..f6f6dd7a 100644 --- a/dataset_configs/commoncrawl/small_de_en.yaml +++ b/dataset_configs/commoncrawl/small_de_en.yaml @@ -1,4 +1,4 @@ -processors_to_run: "0:" +processors_to_run: "9" workspace_dir: /mnt/ssd8/cc_sdp/de_en processors: @@ -116,9 +116,10 @@ processors: input_audio_field: audio_filepath output_field: sonar_dist device: cuda - batch_size: 256 + batch_size: 64 speech_encoder_model: sonar_speech_encoder_deu text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: eng_Latn - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest14.json diff --git a/dataset_configs/commoncrawl/small_fr.yaml b/dataset_configs/commoncrawl/small_fr.yaml index 0406c2fa..f8699a91 100644 --- a/dataset_configs/commoncrawl/small_fr.yaml +++ b/dataset_configs/commoncrawl/small_fr.yaml @@ -57,7 +57,7 @@ processors: - _target_: sdp.processors.SubRegex output_manifest_file: ${workspace_dir}/manifest6.json - text_key: normalized + text_key: text regex_params_list: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - {"pattern": "^\\s*'*\\s*", "repl": ""} @@ -67,27 +67,29 @@ processors: - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest7.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - _target_: 
sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json text_key: text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} - {"pattern": ",", "repl": " "} - {"pattern": " ", "repl": " "} + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_patterns: + - "^\\s*$" + - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest11.json duplicate_fields: {"pred_text":"pred_text_pc"} diff --git a/dataset_configs/commoncrawl/small_pl.yaml b/dataset_configs/commoncrawl/small_pl.yaml index c2648ebb..ba8d1bd2 100644 --- a/dataset_configs/commoncrawl/small_pl.yaml +++ b/dataset_configs/commoncrawl/small_pl.yaml @@ -68,27 +68,28 @@ processors: - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest7.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json text_key: text 
regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} - {"pattern": ",", "repl": " "} - {"pattern": " ", "repl": " "} + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_patterns: + - "^\\s*$" + - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest11.json duplicate_fields: {"pred_text":"pred_text_pc"} diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 7c297462..abe5d057 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -49,24 +49,27 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_manifest_file: ${workspace_dir}/manifest6ps.json - output_text_field: url + output_manifest_file: ${workspace_dir}/manifest6.json + output_video_field: video + output_vtt_field: caption key_field: key - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir}/manifest7ps.json + output_manifest_file: ${workspace_dir}/manifest7.json splited_audio_dir: ${workspace_dir}/splited_s/ source_audio_field: audios audio_lang_field: audio_lang text_lang_field: text_lang - key_field: "key" + url_video_field: video + url_vtt_field: caption + key_field: key target_audio_field: "audio_filepath" duration_field: "duration" text_field: "text" - vtt_field: "vtt_filepath" + vtt_field: "vtt_filepath" # audio duration splited 532.25 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest8.json high_duration_threshold: 40 low_duration_threshold: 0.02 diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index c25f1aab..3eec03af 
100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -12,6 +12,7 @@ from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new +from scipy.spatial import distance class UseSonar(BaseProcessor): """ @@ -24,6 +25,7 @@ def __init__( input_audio_field: str, output_field: str, speech_encoder_model: str, + text_encoder_lang: str, text_encoder_model: str, batch_size: int = 64, device: str = "cuda", @@ -32,6 +34,8 @@ def __init__( super().__init__(**kwargs) import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo from torch.nn import PairwiseDistance + from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline + from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline from sonar.models.sonar_speech.loader import load_sonar_speech_model from sonar.models.sonar_text import ( @@ -44,37 +48,55 @@ def __init__( self.input_audio_field = input_audio_field self.batch_size = batch_size self.device = device + self.text_encoder_lang = text_encoder_lang self.text_encoder_model = load_sonar_text_encoder_model(text_encoder_model, device=self.device).eval() self.text_tokenizer = load_sonar_tokenizer(text_encoder_model) self.speech_encoder_model = load_sonar_speech_model(speech_encoder_model, device=self.device).eval() self.pdist = PairwiseDistance(p=2) + self.s2vec_model = SpeechToEmbeddingModelPipeline(encoder=self.speech_encoder_model) + self.text_embedding_pipeline = TextToEmbeddingModelPipeline(self.text_encoder_model, self.text_tokenizer) def process(self): - from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline - from sonar.inference_pipelines.speech import 
SpeechToEmbeddingModelPipeline - s2vec_model = SpeechToEmbeddingModelPipeline(encoder=self.speech_encoder_model) - text_embedding_pipeline = TextToEmbeddingModelPipeline(self.text_encoder_model, self.text_tokenizer) + manifest = load_manifest(Path(self.input_manifest_file)) - manifest, dir_list = load_manifest(Path(self.input_manifest_file), keys = [self.input_audio_field, self.input_text_field]) + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(manifest): + input_texts = [item[self.input_text_field]] + input_audios = [item[self.input_audio_field]] + dist = self.get_pdist(input_texts, input_audios) + item[self.output_field] = dist + f.write(json.dumps(item, ensure_ascii=False) + '\n') - text_emb = text_embedding_pipeline.predict(input = dir_list[self.input_text_field], - batch_size = self.batch_size, - source_lang="eng_Latn") + def get_pdist(self, input_texts, input_audios): + text_emb = self.text_embedding_pipeline.predict(input = input_texts, + batch_size = 1, + source_lang=self.text_encoder_lang) - audio_emb = s2vec_model.predict(input = dir_list[self.input_audio_field], - batch_size = self.batch_size, - n_parallel = 20, + audio_emb = self.s2vec_model.predict(input = input_audios, + batch_size = 1, + n_parallel = 1, pad_idx = 0, - n_prefetched_batches = 2,) - - pdist = self.pdist(text_emb, audio_emb).numpy().astype(float) - + n_prefetched_batches = 1,) + # pdist = self.pdist(text_emb, audio_emb).numpy().squeeze().astype(float).tolist() + pdist = distance.cdist(text_emb.numpy().astype(float), audio_emb.numpy().astype(float), 'sqeuclidean').squeeze().tolist() + return pdist + + def process_batch(self): + manifest, dict_list = load_manifest(Path(self.input_manifest_file), keys = [self.input_audio_field, self.input_text_field]) + manifest_len = len(manifest) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - assert(len(manifest)==len(pdist)) 
with Path(self.output_manifest_file).open('w') as f: - for item, dist in tqdm(zip(manifest,pdist)): - item[self.output_field] = dist - f.write(json.dumps(item, ensure_ascii=False) + '\n') + for start in tqdm(range(0, manifest_len, self.batch_size)): + stop = start + self.batch_size + input_texts = dict_list[self.input_text_field][start:stop] + input_audios = dict_list[self.input_audio_field][start:stop] + manifest_batch = manifest[start:stop] + + dists = self.get_pdist(input_texts, input_audios) + for item, dist in zip(manifest_batch, dists): + item[self.output_field] = dist + f.write(json.dumps(item, ensure_ascii=False) + '\n') class BLEUScore(BaseParallelProcessor): """ @@ -271,6 +293,8 @@ def __init__( duration_field: str, text_field: str, vtt_field: str, + url_video_field: str, + url_vtt_field: str, duration_threshold: float = 10.0, **kwargs, ): @@ -285,6 +309,8 @@ def __init__( self.text_field = text_field self.vtt_field = vtt_field self.duration_threshold = duration_threshold + self.url_video_field = url_video_field + self.url_vtt_field = url_vtt_field def prepare(self): os.makedirs(self.splited_audio_dir, exist_ok=True) @@ -308,7 +334,7 @@ def process_dataset_entry(self, data_entry): else: pass end_c = end_sr - if len(text_c)>0 and (end_c - start_c > self.duration_threshold * 16000 or text_c[-1] == "." or text_c[-1] == "?"): + if len(text_c)>0 and (end_c - start_c > self.duration_threshold * samplerate or text_c[-1] == "." 
or text_c[-1] == "?"): res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) text_c = '' start_c, end_c = 0, 0 @@ -321,15 +347,19 @@ def process_dataset_entry(self, data_entry): def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c): data_sample = data[start_c:end_c] - wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/16))+"-"+str(int(end_c/16))+".wav") + wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/(samplerate/1000)))+"-"+str(int(end_c/(samplerate/1000)))+".wav") os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) sf.write(wav_save_file, data_sample, samplerate) return DataEntry(data = {self.target_audio_field: wav_save_file, self.duration_field: data_sample.shape[0]/samplerate, - self.text_field: text_c, + self.text_field: text_c.strip(), self.audio_lang_field: data_entry[self.audio_lang_field], self.text_lang_field: data_entry[self.text_lang_field], - self.key_field: key}) + self.url_video_field: data_entry[self.url_video_field], + self.url_vtt_field: data_entry[self.url_vtt_field], + self.key_field: key, + }) + class SplitByVtt(BaseParallelProcessor): """ @@ -556,34 +586,41 @@ class ReadParquet(BaseParallelProcessor): """ def __init__( self, - output_text_field: str, + output_video_field: str, + output_vtt_field: str, key_field: str, raw_data_dir: str, **kwargs, ): super().__init__(**kwargs) - self.output_text_field = output_text_field + self.output_video_field = output_video_field + self.output_vtt_field = output_vtt_field self.key_field = key_field self.raw_data_dir = Path(raw_data_dir) def prepare(self): parquets = [str(self.raw_data_dir / p) for p in self.raw_data_dir.rglob('*.parquet')] self.urls = None - for parquet in parquets: - df1 = pd.read_parquet(parquet).sort_values("key").set_index("key") - if 
self.urls is None: - self.urls = df1 - else: - self.urls = pd.concat([self.urls, df1]) - + for parquet in tqdm(parquets): + try: + df1 = pd.read_parquet(parquet, engine='fastparquet').sort_values("key").set_index("key") + if self.urls is None: + self.urls = df1 + else: + self.urls = pd.concat([self.urls, df1]) + except Exception as e: + logger.warning(str(e) + ", file: " + parquet) + def process_dataset_entry(self, data_entry): key = data_entry[self.key_field] key = key.split("/")[1] try: - data_entry[self.output_text_field] = self.urls.loc[key]['url'] + data_entry[self.output_video_field] = self.urls.loc[key]['url'] + data_entry[self.output_vtt_field] = self.urls.loc[key]['caption'] except: - data_entry[self.output_text_field] = "NN" - logger.warning("Key: " + key) + data_entry[self.output_video_field] = "NN" + data_entry[self.output_vtt_field] = "NN" + logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] class CreateInitialManifestCC(BaseParallelProcessor): From 9a74b30e2fe928edb856c7a63907305afd2b518f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 25 Sep 2023 02:44:17 -0700 Subject: [PATCH 005/115] proxy_fields Signed-off-by: Nikolay Karpov --- .../commoncrawl/small_sentence.yaml | 9 ++--- .../datasets/commoncrawl/commoncrawl.py | 36 +++++++------------ 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index abe5d057..bb5c16c0 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -58,16 +58,11 @@ processors: output_manifest_file: ${workspace_dir}/manifest7.json splited_audio_dir: ${workspace_dir}/splited_s/ source_audio_field: audios - audio_lang_field: audio_lang - text_lang_field: text_lang - url_video_field: video - url_vtt_field: caption - key_field: key + vtt_field: "vtt_filepath" target_audio_field: "audio_filepath" duration_field: 
"duration" text_field: "text" - vtt_field: "vtt_filepath" - # audio duration splited 532.25 + proxy_fields: [audio_lang, text_lang, video, caption] - _target_: sdp.processors.DropHighLowDuration output_manifest_file: ${workspace_dir}/manifest8.json diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 3eec03af..d4791004 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -286,37 +286,28 @@ def __init__( self, splited_audio_dir: str, source_audio_field: str, - text_lang_field: str, - audio_lang_field: str, - key_field: str, target_audio_field: str, duration_field: str, text_field: str, vtt_field: str, - url_video_field: str, - url_vtt_field: str, + proxy_fields: List[str] = [], duration_threshold: float = 10.0, **kwargs, ): super().__init__(**kwargs) self.splited_audio_dir = splited_audio_dir self.source_audio_field = source_audio_field - self.text_lang_field = text_lang_field - self.audio_lang_field = audio_lang_field - self.key_field = key_field self.target_audio_field = target_audio_field self.duration_field = duration_field self.text_field = text_field self.vtt_field = vtt_field self.duration_threshold = duration_threshold - self.url_video_field = url_video_field - self.url_vtt_field = url_vtt_field + self.proxy_fields = proxy_fields def prepare(self): os.makedirs(self.splited_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - key = data_entry[self.key_field] vtt_file = data_entry[self.vtt_field] source_audio = data_entry[self.source_audio_field] res_list = [] @@ -335,30 +326,29 @@ def process_dataset_entry(self, data_entry): pass end_c = end_sr if len(text_c)>0 and (end_c - start_c > self.duration_threshold * samplerate or text_c[-1] == "." 
or text_c[-1] == "?"): - res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) + res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) text_c = '' start_c, end_c = 0, 0 else: pass if len(text_c)>0 and start_c!=0: - res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) + res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) return res_list - def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c): + def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, end_c): data_sample = data[start_c:end_c] wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/(samplerate/1000)))+"-"+str(int(end_c/(samplerate/1000)))+".wav") os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) sf.write(wav_save_file, data_sample, samplerate) - return DataEntry(data = {self.target_audio_field: wav_save_file, - self.duration_field: data_sample.shape[0]/samplerate, - self.text_field: text_c.strip(), - self.audio_lang_field: data_entry[self.audio_lang_field], - self.text_lang_field: data_entry[self.text_lang_field], - self.url_video_field: data_entry[self.url_video_field], - self.url_vtt_field: data_entry[self.url_vtt_field], - self.key_field: key, - }) + + data = {self.target_audio_field: wav_save_file, + self.duration_field: data_sample.shape[0]/samplerate, + self.text_field: text_c.strip(), + } + for proxy_field in self.proxy_fields: + data[proxy_field] = data_entry[proxy_field] + return DataEntry(data = data) class SplitByVtt(BaseParallelProcessor): From f2c8f2bb946ca9da5b066d7fa05d4e4969d8c289 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 25 Sep 2023 03:07:23 -0700 Subject: [PATCH 006/115] duration_threshold Signed-off-by: Nikolay Karpov --- 
dataset_configs/commoncrawl/small_sentence.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index bb5c16c0..4f8ebda3 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -63,7 +63,8 @@ processors: duration_field: "duration" text_field: "text" proxy_fields: [audio_lang, text_lang, video, caption] - + duration_threshold: 10.0 + - _target_: sdp.processors.DropHighLowDuration output_manifest_file: ${workspace_dir}/manifest8.json high_duration_threshold: 40 From 199bc22842ebaf83f6069b97e0847d52bc7a3cd0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 25 Sep 2023 03:13:36 -0700 Subject: [PATCH 007/115] big Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 75 +++++++ dataset_configs/commoncrawl/big_de.yaml | 151 +++++++++++++ dataset_configs/commoncrawl/big_de_en.yaml | 142 ++++++++++++ dataset_configs/commoncrawl/big_en.yaml | 202 ++++++++++++++++++ dataset_configs/commoncrawl/big_en_de.yaml | 131 ++++++++++++ dataset_configs/commoncrawl/big_en_fr.yaml | 122 +++++++++++ dataset_configs/commoncrawl/big_es.yaml | 155 ++++++++++++++ dataset_configs/commoncrawl/big_fr.yaml | 145 +++++++++++++ dataset_configs/commoncrawl/big_fr_en.yaml | 138 ++++++++++++ dataset_configs/commoncrawl/big_pl.yaml | 125 +++++++++++ dataset_configs/commoncrawl/big_sentence.yaml | 70 ++++++ 11 files changed, 1456 insertions(+) create mode 100644 dataset_configs/commoncrawl/big.yaml create mode 100644 dataset_configs/commoncrawl/big_de.yaml create mode 100644 dataset_configs/commoncrawl/big_de_en.yaml create mode 100644 dataset_configs/commoncrawl/big_en.yaml create mode 100644 dataset_configs/commoncrawl/big_en_de.yaml create mode 100644 dataset_configs/commoncrawl/big_en_fr.yaml create mode 100644 dataset_configs/commoncrawl/big_es.yaml create mode 100644 
dataset_configs/commoncrawl/big_fr.yaml create mode 100644 dataset_configs/commoncrawl/big_fr_en.yaml create mode 100644 dataset_configs/commoncrawl/big_pl.yaml create mode 100644 dataset_configs/commoncrawl/big_sentence.yaml diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml new file mode 100644 index 00000000..7af211ea --- /dev/null +++ b/dataset_configs/commoncrawl/big.yaml @@ -0,0 +1,75 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md1/common_crawl/cc_sdp + +processors: + - _target_: sdp.processors.datasets.cc.cc.CreateInitialManifestCC + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir}/manifest0.json + resampled_audio_dir: ${workspace_dir}/audio/ + target_samplerate: 16000 + target_nchannels: 1 + audio_field: "audios" + video_field: "videos" + key_field: "key" + text_field: "texts" + + - _target_: sdp.processors.datasets.cc.cc.TxtToVtt + input_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest1.json + vtt_files_dir: ${workspace_dir}/vtts/ + key_field: "key" + text_field: "texts" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.datasets.cc.cc.AllVttText + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest2.json + input_filepath_field: vtt_filepath + output_text_field: vtt_text + + - _target_: sdp.processors.datasets.cc.cc.TextLid + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + input_text_field: vtt_text + output_lang_field: text_lang + device: cuda + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + + - _target_: sdp.processors.datasets.cc.cc.Lang2Iso + input_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest4.json + input_lang_field: text_lang + output_lang_field: text_lang + + - _target_: sdp.processors.datasets.cc.cc.AudioLid + 
input_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json + input_audio_field: audios + output_lang_field: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + - _target_: sdp.processors.datasets.cc.cc.SplitByVtt + input_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json + splited_audio_dir: ${workspace_dir}/splited/ + source_audio_field: audios + audio_lang_field: audio_lang + text_lang_field: text_lang + key_field: "key" + target_audio_field: "audio_filepath" + duration_field: "duration" + text_field: "text" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest7.json + high_duration_threshold: 40 + low_duration_threshold: 0.2 + + - _target_: sdp.processors.datasets.cc.cc.ReadParquet + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir}/manifest8.json + output_video_field: video + output_vtt_field: caption + key_field: key diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml new file mode 100644 index 00000000..b09207cb --- /dev/null +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -0,0 +1,151 @@ +processors_to_run: "0:" # ü ä ö ß Ä Ö Ü +workspace_dir: /mnt/md0/common_crawl/cc_sdp/de + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: de + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: de + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: 
sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + 
--whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" + # --overwrite_cache + + # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # input_manifest_file: ${workspace_dir}/manifest6.json + # output_manifest_file: ${workspace_dir}/manifest7.json + # input_manifest_arg: "--input_file" + # output_manifest_arg: "--output_file" + # arg_separator: "=" + # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + # --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" + + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "'", "repl": " "} + - {"pattern": "[^a-zA-ZäöüÄÖÜß.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest10.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: 
${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_de_en.yaml b/dataset_configs/commoncrawl/big_de_en.yaml new file mode 100644 index 00000000..07d57983 --- /dev/null +++ b/dataset_configs/commoncrawl/big_de_en.yaml @@ -0,0 +1,142 @@ +processors_to_run: "14:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/de_en + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: de + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + 
- _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + --model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest9.json + 
input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest10.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-zäöüÄÖÜß'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.BLEUScore + output_manifest_file: ${workspace_dir}/manifest13.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.cc.cc.UseSonar + output_manifest_file: ${workspace_dir}/manifest14.json + input_text_field: text + input_audio_field: audio_filepath + 
output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_deu + text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: eng_Latn + batch_size: 64 + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest15s.json + input_field: sonar_dist + target_value: 0.1 + operator: le + + # - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + # output_manifest_file: ${workspace_dir}/manifest15.json + # input_field: bleu + # target_value: 10 + # operator: ge \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml new file mode 100644 index 00000000..ef737ef5 --- /dev/null +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -0,0 +1,202 @@ +processors_to_run: "3:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/en + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: en + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # - '://' + # - "(\\s)+(www)\\.[a-zA-Z0-9/]+(\\s|$)+" + # - '\\x' + - "www\\.wiki" + - "www\\.usgs\\." 
+ # - 'é' + # - 'ô' + # - '×' + # - 'š' + # - 'ö' + # - 'ß' + # - 'ä' + # - 'ü' + # - '\u202a' + # - 'č' + # - 'ć' + # - 'á' + # - 'ã' + # - 'â' + # - 'ï' + # - '\u2060' + # - 'ñ' + # - 'ŵ' + # - 'à' + # - 'ù' + # - 'ò' + # - 'ó' + # - 'ő' + # - 'ê' + # - 'ă' + # - 'ú' + # - 'µ' + # - '¿' + # - '¡' + # - 'ë' + # - "è" + # - "é" + # - "È" + # - "É" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest4.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + # - {"pattern": "%", "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest7.json + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest8.json + 
input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + + # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # input_manifest_file: ${workspace_dir}/manifest6.json + # output_manifest_file: ${workspace_dir}/manifest7.json + # input_manifest_arg: "--input_file" + # output_manifest_arg: "--output_file" + # arg_separator: "=" + # cmd: ""python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest9.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-z'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the 
gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest11.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest15.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest18.json + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest20.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en_de.yaml b/dataset_configs/commoncrawl/big_en_de.yaml new file mode 100644 index 
00000000..341b1f69 --- /dev/null +++ b/dataset_configs/commoncrawl/big_en_de.yaml @@ -0,0 +1,131 @@ +processors_to_run: "15:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_de + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: en + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: de + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + --model=${workspace_dir}/nmt_en_de_transformer12x2.nemo --target_lang=de --source_lang=en" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 
'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '¡', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest9.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest10.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_params_list: + - {"pattern": 'ç', "repl": "c"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": 
"[^A-Za-zäöüÄÖÜß.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.BLEUScore + output_manifest_file: ${workspace_dir}/manifest13.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.cc.cc.UseSonar + output_manifest_file: ${workspace_dir}/manifest14.json + input_text_field: text + input_audio_field: audio_filepath + output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_eng + text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: deu_Latn + batch_size: 64 + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest15.json + input_field: bleu + target_value: 30 + operator: ge diff --git a/dataset_configs/commoncrawl/big_en_fr.yaml b/dataset_configs/commoncrawl/big_en_fr.yaml new file mode 100644 index 00000000..d8476d27 --- /dev/null +++ b/dataset_configs/commoncrawl/big_en_fr.yaml @@ -0,0 +1,122 @@ +processors_to_run: "12:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_fr + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: en + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: fr + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: 
sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + --model=${workspace_dir}/nmt_en_fr_transformer12x2.nemo --target_lang=fr --source_lang=en" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + # - {"pattern": "%", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + # - _target_: 
sdp.processors.datasets.cc.cc.Subprocess + # # input_manifest_file: ${workspace_dir}/manifest7.json + # output_manifest_file: ${workspace_dir}/manifest10.json + # input_manifest_arg: "--manifest" + # output_manifest_arg: "--output_filename" + # arg_separator: "=" + # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + # --language=fr --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/fr/data/whitelist.tsv" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + + - _target_: sdp.processors.datasets.cc.cc.BLEUScore + output_manifest_file: ${workspace_dir}/manifest10.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.cc.cc.UseSonar + output_manifest_file: ${workspace_dir}/manifest11.json + input_text_field: text + input_audio_field: audio_filepath + output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_eng + text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: fra_Latn + batch_size: 64 + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest12.json + input_field: bleu + target_value: 30 + operator: ge diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml new file mode 100644 index 00000000..b148b857 --- /dev/null +++ 
b/dataset_configs/commoncrawl/big_es.yaml @@ -0,0 +1,155 @@ +processors_to_run: "4:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/es + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: es + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: es + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "í"} + - {"pattern": 'è', "repl": "é"} + - {"pattern": 'È', "repl": "É"} + - {"pattern": 'ù', "repl": "ú"} + - {"pattern": 'ò', "repl": "ó"} + - {"pattern": 'à', "repl": "á"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - 
{"pattern": '•', "repl": " "} + - {"pattern": '●', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: text + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚ'.,?¿]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: 
{text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest10.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml new file mode 100644 index 00000000..898880bb --- /dev/null +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -0,0 +1,145 @@ +processors_to_run: "8:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr + +processors: + - _target_: 
sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: fr + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: fr + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest2.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest3.json + pretrained_model: nvidia/stt_fr_conformer_transducer_large #stt_fr_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest4.json + regex_patterns: + - '\\x' + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": "\\\\x[a-f\\d]{1,}", "repl": " "} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + 
high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest8.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=fr --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/fr/data/whitelist.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest9.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest11.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file:
${workspace_dir}/manifest14.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest15.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr_en.yaml b/dataset_configs/commoncrawl/big_fr_en.yaml new file mode 100644 index 00000000..d8473315 --- /dev/null +++ b/dataset_configs/commoncrawl/big_fr_en.yaml @@ -0,0 +1,138 @@ +processors_to_run: "14:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr_en + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: fr + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_fr_fastconformer_hybrid_large_pc #stt_fr_conformer_transducer_large + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json 
+ duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + --model=${workspace_dir}/nmt_fr_en_transformer12x2.nemo --target_lang=en --source_lang=fr" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate +
output_manifest_file: ${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest9.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest10.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-zàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.BLEUScore + 
output_manifest_file: ${workspace_dir}/manifest13.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.cc.cc.UseSonar + output_manifest_file: ${workspace_dir}/manifest14.json + input_text_field: text + input_audio_field: audio_filepath + output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_fra + text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: eng_Latn + batch_size: 64 + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest15.json + input_field: bleu + target_value: 10 + operator: ge diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml new file mode 100644 index 00000000..38211cc7 --- /dev/null +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -0,0 +1,125 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/pl + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: pl + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: pl + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_pl_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": 
"[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest8.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: 
${workspace_dir}/manifest11.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml new file mode 100644 index 00000000..6870d144 --- /dev/null +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -0,0 +1,70 @@ +processors_to_run: "7:" +workspace_dir: /mnt/md1/common_crawl/cc_sdp +workspace_dir_s: /mnt/md0/common_crawl/cc_sdp + +processors: + - _target_: sdp.processors.datasets.cc.cc.CreateInitialManifestCC + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir}/manifest0.json + resampled_audio_dir: ${workspace_dir}/audio/ + target_samplerate: 16000 + target_nchannels: 1 + audio_field: "audios" + video_field: "videos" + key_field: "key" + text_field: "texts" + + - _target_: sdp.processors.datasets.cc.cc.TxtToVtt + input_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest1.json + vtt_files_dir: ${workspace_dir}/vtts/ + key_field: "key" + text_field: "texts" + vtt_field: "vtt_filepath" + + - _target_: 
sdp.processors.datasets.cc.cc.AllVttText + output_manifest_file: ${workspace_dir}/manifest2.json + input_filepath_field: vtt_filepath + output_text_field: vtt_text + + - _target_: sdp.processors.datasets.cc.cc.TextLid + output_manifest_file: ${workspace_dir}/manifest3.json + input_text_field: vtt_text + output_lang_field: text_lang + device: cuda + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + + - _target_: sdp.processors.datasets.cc.cc.Lang2Iso + output_manifest_file: ${workspace_dir}/manifest4.json + input_lang_field: text_lang + output_lang_field: text_lang + + - _target_: sdp.processors.datasets.cc.cc.AudioLid + output_manifest_file: ${workspace_dir}/manifest5.json + input_audio_field: audios + output_lang_field: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + - _target_: sdp.processors.datasets.cc.cc.ReadParquet + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir_s}/manifest6.json + output_video_field: video + output_vtt_field: caption + key_field: key + + - _target_: sdp.processors.datasets.cc.cc.SplitByVttSentence + output_manifest_file: ${workspace_dir_s}/manifest7.json + splited_audio_dir: ${workspace_dir_s}/splited/ + source_audio_field: audios + target_audio_field: audio_filepath + duration_field: duration + text_field: text + vtt_field: vtt_filepath + proxy_fields: [audio_lang, text_lang, video, caption] + duration_threshold: 10.0 + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir_s}/manifest8.json + high_duration_threshold: 40 + low_duration_threshold: 0.02 From f450f421f5acd626ee9f420bd25a002196a88991 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 25 Sep 2023 03:15:14 -0700 Subject: [PATCH 008/115] small Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dataset_configs/commoncrawl/small.yaml 
b/dataset_configs/commoncrawl/small.yaml index a261dd39..d7a61254 100644 --- a/dataset_configs/commoncrawl/small.yaml +++ b/dataset_configs/commoncrawl/small.yaml @@ -1,4 +1,4 @@ -processors_to_run: "9:" +processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp final_manifest: ${workspace_dir}/full_manifest.json group_duration_threshold: 20.0 @@ -76,5 +76,6 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 output_manifest_file: ${workspace_dir}/manifest8.json - output_text_field: url + output_video_field: video + output_vtt_field: caption key_field: key \ No newline at end of file From 1952828bf315f9630e458a61e770e5f085488e76 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 28 Sep 2023 21:38:39 -0700 Subject: [PATCH 009/115] yaml Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 16 ++++++++-------- dataset_configs/commoncrawl/big_de.yaml | 12 ++++++------ dataset_configs/commoncrawl/big_de_en.yaml | 18 +++++++++--------- dataset_configs/commoncrawl/big_en.yaml | 8 ++++---- dataset_configs/commoncrawl/big_en_de.yaml | 16 ++++++++-------- dataset_configs/commoncrawl/big_en_fr.yaml | 14 +++++++------- dataset_configs/commoncrawl/big_es.yaml | 14 +++++++------- dataset_configs/commoncrawl/big_fr.yaml | 2 +- dataset_configs/commoncrawl/big_fr_en.yaml | 2 +- dataset_configs/commoncrawl/big_pl.yaml | 8 ++++---- dataset_configs/commoncrawl/big_sentence.yaml | 16 ++++++++-------- 11 files changed, 63 insertions(+), 63 deletions(-) diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml index 7af211ea..ba34839d 100644 --- a/dataset_configs/commoncrawl/big.yaml +++ b/dataset_configs/commoncrawl/big.yaml @@ -2,7 +2,7 @@ processors_to_run: "0:" workspace_dir: /mnt/md1/common_crawl/cc_sdp processors: - - _target_: sdp.processors.datasets.cc.cc.CreateInitialManifestCC + - _target_: 
sdp.processors.datasets.commoncrawl.CreateInitialManifestCC raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir}/manifest0.json resampled_audio_dir: ${workspace_dir}/audio/ @@ -13,7 +13,7 @@ processors: key_field: "key" text_field: "texts" - - _target_: sdp.processors.datasets.cc.cc.TxtToVtt + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt input_manifest_file: ${workspace_dir}/manifest0.json output_manifest_file: ${workspace_dir}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ @@ -21,13 +21,13 @@ processors: text_field: "texts" vtt_field: "vtt_filepath" - - _target_: sdp.processors.datasets.cc.cc.AllVttText + - _target_: sdp.processors.datasets.commoncrawl.AllVttText input_manifest_file: ${workspace_dir}/manifest1.json output_manifest_file: ${workspace_dir}/manifest2.json input_filepath_field: vtt_filepath output_text_field: vtt_text - - _target_: sdp.processors.datasets.cc.cc.TextLid + - _target_: sdp.processors.datasets.commoncrawl.TextLid input_manifest_file: ${workspace_dir}/manifest2.json output_manifest_file: ${workspace_dir}/manifest3.json input_text_field: vtt_text @@ -35,13 +35,13 @@ processors: device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - - _target_: sdp.processors.datasets.cc.cc.Lang2Iso + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso input_manifest_file: ${workspace_dir}/manifest3.json output_manifest_file: ${workspace_dir}/manifest4.json input_lang_field: text_lang output_lang_field: text_lang - - _target_: sdp.processors.datasets.cc.cc.AudioLid + - _target_: sdp.processors.datasets.commoncrawl.AudioLid input_manifest_file: ${workspace_dir}/manifest4.json output_manifest_file: ${workspace_dir}/manifest5.json input_audio_field: audios @@ -49,7 +49,7 @@ processors: device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.cc.cc.SplitByVtt + - _target_: sdp.processors.datasets.commoncrawl.SplitByVtt 
input_manifest_file: ${workspace_dir}/manifest5.json output_manifest_file: ${workspace_dir}/manifest6.json splited_audio_dir: ${workspace_dir}/splited/ @@ -67,7 +67,7 @@ processors: high_duration_threshold: 40 low_duration_threshold: 0.2 - - _target_: sdp.processors.datasets.cc.cc.ReadParquet + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir}/manifest8.json output_video_field: video diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index b09207cb..ec8bdde6 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -2,13 +2,13 @@ processors_to_run: "0:" # ü ä ö ß Ä Ö Ü workspace_dir: /mnt/md0/common_crawl/cc_sdp/de processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: de - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: de @@ -64,7 +64,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest7.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" @@ -74,7 +74,7 @@ processors: --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" # --overwrite_cache - # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess # 
input_manifest_file: ${workspace_dir}/manifest6.json # output_manifest_file: ${workspace_dir}/manifest7.json # input_manifest_arg: "--input_file" @@ -96,7 +96,7 @@ processors: - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "'", "repl": " "} - - {"pattern": "[^a-zA-ZäöüÄÖÜß.,?]", "repl": " "} + - {"pattern": "[^a-zA-ZäöüÄÖÜßẞ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} - _target_: sdp.processors.DuplicateFields diff --git a/dataset_configs/commoncrawl/big_de_en.yaml b/dataset_configs/commoncrawl/big_de_en.yaml index 07d57983..eb429f45 100644 --- a/dataset_configs/commoncrawl/big_de_en.yaml +++ b/dataset_configs/commoncrawl/big_de_en.yaml @@ -2,13 +2,13 @@ processors_to_run: "14:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/de_en processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: de - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: en @@ -32,7 +32,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess output_manifest_file: ${workspace_dir}/manifest6.json arg_separator: "=" srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt @@ -76,7 +76,7 @@ processors: high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest9.json input_manifest_arg: "--manifest" 
output_manifest_arg: "--output_filename" @@ -112,13 +112,13 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.BLEUScore + - _target_: sdp.processors.datasets.commoncrawl.BLEUScore output_manifest_file: ${workspace_dir}/manifest13.json ref_field: text hyp_field: pred_text output_field: bleu - - _target_: sdp.processors.datasets.cc.cc.UseSonar + - _target_: sdp.processors.datasets.commoncrawl.UseSonar output_manifest_file: ${workspace_dir}/manifest14.json input_text_field: text input_audio_field: audio_filepath @@ -129,13 +129,13 @@ processors: text_encoder_lang: eng_Latn batch_size: 64 - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest15s.json input_field: sonar_dist target_value: 0.1 operator: le - # - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + # - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue # output_manifest_file: ${workspace_dir}/manifest15.json # input_field: bleu # target_value: 10 diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index ef737ef5..b2c7ddb6 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -2,13 +2,13 @@ processors_to_run: "3:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/en processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: en @@ -106,7 +106,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: 
sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest8.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" @@ -115,7 +115,7 @@ processors: --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess # input_manifest_file: ${workspace_dir}/manifest6.json # output_manifest_file: ${workspace_dir}/manifest7.json # input_manifest_arg: "--input_file" diff --git a/dataset_configs/commoncrawl/big_en_de.yaml b/dataset_configs/commoncrawl/big_en_de.yaml index 341b1f69..a39dc84c 100644 --- a/dataset_configs/commoncrawl/big_en_de.yaml +++ b/dataset_configs/commoncrawl/big_en_de.yaml @@ -1,14 +1,14 @@ -processors_to_run: "15:" +processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_de processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: de @@ -32,7 +32,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess output_manifest_file: ${workspace_dir}/manifest6.json arg_separator: "=" srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt @@ 
-78,7 +78,7 @@ processors: high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest9.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" @@ -107,13 +107,13 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.BLEUScore + - _target_: sdp.processors.datasets.commoncrawl.BLEUScore output_manifest_file: ${workspace_dir}/manifest13.json ref_field: text hyp_field: pred_text output_field: bleu - - _target_: sdp.processors.datasets.cc.cc.UseSonar + - _target_: sdp.processors.datasets.commoncrawl.UseSonar output_manifest_file: ${workspace_dir}/manifest14.json input_text_field: text input_audio_field: audio_filepath @@ -124,7 +124,7 @@ processors: text_encoder_lang: deu_Latn batch_size: 64 - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest15.json input_field: bleu target_value: 30 diff --git a/dataset_configs/commoncrawl/big_en_fr.yaml b/dataset_configs/commoncrawl/big_en_fr.yaml index d8476d27..441d665b 100644 --- a/dataset_configs/commoncrawl/big_en_fr.yaml +++ b/dataset_configs/commoncrawl/big_en_fr.yaml @@ -2,13 +2,13 @@ processors_to_run: "12:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_fr processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: fr @@ -32,7 +32,7 @@ 
processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess output_manifest_file: ${workspace_dir}/manifest6.json arg_separator: "=" srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt @@ -75,7 +75,7 @@ processors: high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess # # input_manifest_file: ${workspace_dir}/manifest7.json # output_manifest_file: ${workspace_dir}/manifest10.json # input_manifest_arg: "--manifest" @@ -98,13 +98,13 @@ processors: - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.datasets.cc.cc.BLEUScore + - _target_: sdp.processors.datasets.commoncrawl.BLEUScore output_manifest_file: ${workspace_dir}/manifest10.json ref_field: text hyp_field: pred_text output_field: bleu - - _target_: sdp.processors.datasets.cc.cc.UseSonar + - _target_: sdp.processors.datasets.commoncrawl.UseSonar output_manifest_file: ${workspace_dir}/manifest11.json input_text_field: text input_audio_field: audio_filepath @@ -115,7 +115,7 @@ processors: text_encoder_lang: fra_Latn batch_size: 64 - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest12.json input_field: bleu target_value: 30 diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index b148b857..bde6b513 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -29,12 +29,12 @@ processors: - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "í"} - - {"pattern": 'è', "repl": "é"} - - {"pattern": 'È', "repl": "É"} - - {"pattern": 'ù', "repl": "ú"} - - 
{"pattern": 'ò', "repl": "ó"} - - {"pattern": 'à', "repl": "á"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'è', "repl": "e"} + - {"pattern": 'È', "repl": "E"} + - {"pattern": 'ù', "repl": "u"} + - {"pattern": 'ò', "repl": "o"} + - {"pattern": 'à', "repl": "a"} - {"pattern": '‚', "repl": ","} - {"pattern": "’", "repl": "'"} - {"pattern": "[-–—]", "repl": " "} @@ -95,7 +95,7 @@ processors: - {"pattern": '!', "repl": '.'} - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚ'.,?¿]", "repl": " "} + - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚüÜ'.,?¿]", "repl": " "} - {"pattern": ' ', "repl": " "} test_cases: - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 898880bb..f2e55b59 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -90,7 +90,7 @@ processors: - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} - _target_: sdp.processors.DuplicateFields diff --git a/dataset_configs/commoncrawl/big_fr_en.yaml b/dataset_configs/commoncrawl/big_fr_en.yaml index d8473315..d00548a8 100644 --- a/dataset_configs/commoncrawl/big_fr_en.yaml +++ b/dataset_configs/commoncrawl/big_fr_en.yaml @@ -101,7 +101,7 @@ processors: - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '!', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-zàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": "[^A-Za-zàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} test_cases: - {input: {text: "' jupiter and 
venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 38211cc7..628e80c2 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -2,13 +2,13 @@ processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/pl processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: pl - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: pl @@ -70,7 +70,7 @@ processors: - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} + - {"pattern": "[^a-pr-uwy-zA-PR-UWY-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} - _target_: sdp.processors.DuplicateFields diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 6870d144..9dbc1926 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/md1/common_crawl/cc_sdp workspace_dir_s: /mnt/md0/common_crawl/cc_sdp processors: - - _target_: sdp.processors.datasets.cc.cc.CreateInitialManifestCC + - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir}/manifest0.json resampled_audio_dir: 
${workspace_dir}/audio/ @@ -14,7 +14,7 @@ processors: key_field: "key" text_field: "texts" - - _target_: sdp.processors.datasets.cc.cc.TxtToVtt + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt input_manifest_file: ${workspace_dir}/manifest0.json output_manifest_file: ${workspace_dir}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ @@ -22,38 +22,38 @@ processors: text_field: "texts" vtt_field: "vtt_filepath" - - _target_: sdp.processors.datasets.cc.cc.AllVttText + - _target_: sdp.processors.datasets.commoncrawl.AllVttText output_manifest_file: ${workspace_dir}/manifest2.json input_filepath_field: vtt_filepath output_text_field: vtt_text - - _target_: sdp.processors.datasets.cc.cc.TextLid + - _target_: sdp.processors.datasets.commoncrawl.TextLid output_manifest_file: ${workspace_dir}/manifest3.json input_text_field: vtt_text output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - - _target_: sdp.processors.datasets.cc.cc.Lang2Iso + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir}/manifest4.json input_lang_field: text_lang output_lang_field: text_lang - - _target_: sdp.processors.datasets.cc.cc.AudioLid + - _target_: sdp.processors.datasets.commoncrawl.AudioLid output_manifest_file: ${workspace_dir}/manifest5.json input_audio_field: audios output_lang_field: audio_lang device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.cc.cc.ReadParquet + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir_s}/manifest6.json output_video_field: video output_vtt_field: caption key_field: key - - _target_: sdp.processors.datasets.cc.cc.SplitByVttSentence + - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence output_manifest_file: ${workspace_dir_s}/manifest7.json splited_audio_dir: ${workspace_dir_s}/splited/ 
source_audio_field: audios From c9614f845cbd1133ca6e5c545a415e02a6424d82 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 29 Sep 2023 10:47:20 -0700 Subject: [PATCH 010/115] FfmpegConvert Signed-off-by: Nikolay Karpov --- .../commoncrawl/small_sentence.yaml | 29 ++++------- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 50 +++++++++++++++++-- 3 files changed, 59 insertions(+), 22 deletions(-) diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 4f8ebda3..96298ebc 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -2,33 +2,29 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + input_manifest_file: ${workspace_dir}/manifest_urls.json output_manifest_file: ${workspace_dir}/manifest0.json resampled_audio_dir: ${workspace_dir}/audio/ target_samplerate: 16000 target_nchannels: 1 - audio_field: "audios" video_field: "videos" + audio_field: "audios" key_field: "key" - text_field: "texts" - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - input_manifest_file: ${workspace_dir}/manifest0.json output_manifest_file: ${workspace_dir}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - - _target_: sdp.processors.datasets.commoncrawl.AllVttText - input_manifest_file: ${workspace_dir}/manifest1.json + - _target_: sdp.processors.datasets.commoncrawl.AllVttText output_manifest_file: ${workspace_dir}/manifest2.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - input_manifest_file: ${workspace_dir}/manifest2.json 
output_manifest_file: ${workspace_dir}/manifest3.json input_text_field: vtt_text output_lang_field: text_lang @@ -47,25 +43,22 @@ processors: device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_manifest_file: ${workspace_dir}/manifest6.json - output_video_field: video - output_vtt_field: caption - key_field: key - - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: ${workspace_dir}/manifest6a.json splited_audio_dir: ${workspace_dir}/splited_s/ source_audio_field: audios vtt_field: "vtt_filepath" target_audio_field: "audio_filepath" duration_field: "duration" text_field: "text" - proxy_fields: [audio_lang, text_lang, video, caption] + proxy_fields: [audio_lang, text_lang] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7a.json high_duration_threshold: 40 low_duration_threshold: 0.02 + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/manifest8a.json + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang"] \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index e1c87620..33e5fbce 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, ReadParquet, CreateInitialManifestCC +from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, ReadParquet, CreateInitialManifestCC, FfmpegConvert diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index d4791004..52b22b97 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -339,8 +339,9 @@ def process_dataset_entry(self, data_entry): def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, end_c): data_sample = data[start_c:end_c] wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/(samplerate/1000)))+"-"+str(int(end_c/(samplerate/1000)))+".wav") - os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) - sf.write(wav_save_file, data_sample, samplerate) + if not os.path.isfile(wav_save_file): + os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) + sf.write(wav_save_file, data_sample, samplerate) data = {self.target_audio_field: wav_save_file, self.duration_field: data_sample.shape[0]/samplerate, @@ -663,9 +664,52 @@ def process_dataset_entry(self, data_entry): (video, key, text) = data_entry os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) audio = os.path.join(self.resampled_audio_dir, key) + ".wav" - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + if not os.path.isfile(audio): + ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) data = {self.audio_field: audio, + self.video_field: video, self.key_field: key, self.text_field: text} return 
[DataEntry(data=data)] + +class FfmpegConvert(BaseParallelProcessor): + """ + Args: + video_field (str): field with path to video file in the input manifest + audio_field (str): field with path to audio file in the output manifest + key_field (str): field with key value + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. + """ + def __init__( + self, + resampled_audio_dir: str, + video_field: str, + audio_field: str, + key_field: str, + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.audio_field = audio_field + self.video_field = video_field + self.key_field = key_field + self.resampled_audio_dir = resampled_audio_dir + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def process_dataset_entry(self, data_entry): + video = data_entry[self.video_field] + key = os.path.splitext(data_entry[self.video_field])[0][-13:] + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + audio = os.path.join(self.resampled_audio_dir, key) + ".wav" + + if not os.path.isfile(audio): + ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + + data_entry[self.audio_field]= audio + data_entry[self.key_field] = key + return [DataEntry(data=data_entry)] \ No newline at end of file From e9110704fdfeb7462a0232b65fd1716d175215d1 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 9 Oct 2023 03:27:35 -0700 Subject: [PATCH 011/115] ASR_HF Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 4 +- dataset_configs/commoncrawl/big_en.yaml | 16 ++---- dataset_configs/commoncrawl/big_es.yaml | 6 ++- dataset_configs/commoncrawl/big_fr.yaml | 6 ++- dataset_configs/commoncrawl/big_pl.yaml | 4 +- dataset_configs/commoncrawl/big_sentence.yaml | 39 
++++++-------- .../datasets/commoncrawl/__init__.py | 4 +- .../datasets/commoncrawl/commoncrawl.py | 54 +++++++++++++++++-- .../datasets/commoncrawl/requirements.txt | 2 + 9 files changed, 90 insertions(+), 45 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index ec8bdde6..711d0849 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -93,7 +93,9 @@ processors: output_manifest_file: ${workspace_dir}/manifest9.json text_key: text regex_params_list: - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "'", "repl": " "} - {"pattern": "[^a-zA-ZäöüÄÖÜßẞ.,?]", "repl": " "} diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index b2c7ddb6..8e9e31d0 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -1,4 +1,4 @@ -processors_to_run: "3:" +processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/en processors: @@ -115,16 +115,6 @@ processors: --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # input_manifest_file: ${workspace_dir}/manifest6.json - # output_manifest_file: ${workspace_dir}/manifest7.json - # input_manifest_arg: "--input_file" - # output_manifest_arg: "--output_file" - # arg_separator: "=" - # cmd: ""python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text 
--cache_dir=${workspace_dir}/cache \ - # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest9.json rename_fields: {"normalized":"text"} @@ -136,7 +126,9 @@ processors: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - {"pattern": "^\\s*'*\\s*", "repl": ""} - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '!', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^A-Za-z'.,?]", "repl": " "} diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index bde6b513..9786fff9 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -93,9 +93,11 @@ processors: - {"pattern": "^\\s*'*\\s*", "repl": ""} - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '!', "repl": '.'} - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚüÜ'.,?¿]", "repl": " "} + - {"pattern": "[^a-zA-ZóÓáÁéÉíÍñÑúÚüÜ'.,?¿]", "repl": " "} - {"pattern": ' ', "repl": " "} test_cases: - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index f2e55b59..80a12856 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -1,4 +1,4 @@ -processors_to_run: "8:" +processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr processors: 
@@ -88,7 +88,9 @@ processors: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - {"pattern": "^\\s*'*\\s*", "repl": ""} - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 628e80c2..a7e3a41b 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -68,7 +68,9 @@ processors: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - {"pattern": "^\\s*'*\\s*", "repl": ""} - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^a-pr-uwy-zA-PR-UWY-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 9dbc1926..c7eda2af 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -1,70 +1,65 @@ -processors_to_run: "7:" +processors_to_run: "0:" workspace_dir: /mnt/md1/common_crawl/cc_sdp workspace_dir_s: /mnt/md0/common_crawl/cc_sdp processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/md0/common_crawl/output/video_output2 - output_manifest_file: ${workspace_dir}/manifest0.json + - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + input_manifest_file: ${workspace_dir_s}/manifest_urls.json + output_manifest_file: ${workspace_dir_s}/manifest0.json 
resampled_audio_dir: ${workspace_dir}/audio/ target_samplerate: 16000 target_nchannels: 1 - audio_field: "audios" video_field: "videos" + audio_field: "audios" key_field: "key" - text_field: "texts" - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - input_manifest_file: ${workspace_dir}/manifest0.json - output_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir_s}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir_s}/manifest2.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir_s}/manifest3.json input_text_field: vtt_text output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir_s}/manifest4.json input_lang_field: text_lang output_lang_field: text_lang - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir_s}/manifest5.json input_audio_field: audios output_lang_field: audio_lang device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/md0/common_crawl/output/video_output2 - output_manifest_file: ${workspace_dir_s}/manifest6.json - output_video_field: video - output_vtt_field: caption - key_field: key - - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir_s}/manifest7.json + output_manifest_file: 
${workspace_dir_s}/manifest6.json splited_audio_dir: ${workspace_dir_s}/splited/ source_audio_field: audios target_audio_field: audio_filepath duration_field: duration text_field: text vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang, video, caption] + proxy_fields: [audio_lang, text_lang] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir_s}/manifest8.json + output_manifest_file: ${workspace_dir_s}/manifest7.json high_duration_threshold: 40 low_duration_threshold: 0.02 + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir_s}/manifest8.json + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang"] diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 33e5fbce..7ae86a58 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -12,4 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, ReadParquet, CreateInitialManifestCC, FfmpegConvert +from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ + Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ + ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 52b22b97..a0197707 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -14,6 +14,53 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new from scipy.spatial import distance +class ASR_HF(BaseProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + + def process(self): + import torch + from huggingsound import SpeechRecognitionModel + + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + model = SpeechRecognitionModel(self.pretrained_model, + device = self.device, + letter_case = None) + + manifest, key_dict = load_manifest(Path(self.input_manifest_file), keys = ["audio_filepath"]) + audio_paths = key_dict["audio_filepath"] + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + transcriptions = model.transcribe(paths = audio_paths, + batch_size = self.batch_size, + decoder=None) + + with Path(self.output_manifest_file).open('w') as f: + for item, transcription in tqdm(zip(manifest, transcriptions)): + item[self.output_text_field] = transcription["transcription"] + f.write(json.dumps(item, ensure_ascii=False) + '\n') + class UseSonar(BaseProcessor): """ Args: @@ -673,12 +720,11 @@ def process_dataset_entry(self, data_entry): self.text_field: text} return [DataEntry(data=data)] + class FfmpegConvert(BaseParallelProcessor): """ Args: - video_field (str): field with path to video file in the input manifest - audio_field (str): field with path to audio file in the output manifest - key_field (str): field with key value + raw_data_dir (str): where to put raw downloaded data. resampled_audio_dir (str): where to put re-sampled and trimmed wav files. target_samplerate (int): sample rate to resample to. Defaults to 16000. target_nchannels (int): target number of channels. Defaults to 1. 
@@ -686,8 +732,8 @@ class FfmpegConvert(BaseParallelProcessor): def __init__( self, resampled_audio_dir: str, - video_field: str, audio_field: str, + video_field: str, key_field: str, target_samplerate: int = 16000, target_nchannels: int = 1, diff --git a/sdp/processors/datasets/commoncrawl/requirements.txt b/sdp/processors/datasets/commoncrawl/requirements.txt index 39d03091..f0b24650 100644 --- a/sdp/processors/datasets/commoncrawl/requirements.txt +++ b/sdp/processors/datasets/commoncrawl/requirements.txt @@ -5,3 +5,5 @@ fastparquet pysndfile # conda install -c conda-forge libsndfile==1.0.31 sonar-space fairseq2 +huggingsound +pyarrow==12.0.1 \ No newline at end of file From 1097672951393559e2e3c3249d2a06d402c9ef80 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 9 Oct 2023 03:29:04 -0700 Subject: [PATCH 012/115] args Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/commoncrawl/commoncrawl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index a0197707..d5992d73 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -724,7 +724,9 @@ def process_dataset_entry(self, data_entry): class FfmpegConvert(BaseParallelProcessor): """ Args: - raw_data_dir (str): where to put raw downloaded data. + video_field (str): field with path to video file in the input manifest + audio_field (str): field with path to audio file in the output manifest + key_field (str): field with key value resampled_audio_dir (str): where to put re-sampled and trimmed wav files. target_samplerate (int): sample rate to resample to. Defaults to 16000. target_nchannels (int): target number of channels. Defaults to 1. 
From d90cd6173538da5b28d3e2fec6cef16608851cb3 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 20 Oct 2023 10:20:30 -0700 Subject: [PATCH 013/115] duration_key Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/data_to_dropbool.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index f7b30e03..3a340ae1 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -166,17 +166,17 @@ class DropHighLowDuration(BaseParallelProcessor): """ def __init__( - self, high_duration_threshold: float, low_duration_threshold: float, text_key: str = "text", **kwargs, + self, high_duration_threshold: float, low_duration_threshold: float, duration_key: str = "duration", **kwargs, ): super().__init__(**kwargs) self.high_duration_threshold = high_duration_threshold self.low_duration_threshold = low_duration_threshold self.high_drop_counter = 0 self.low_drop_counter = 0 - self.text_key = text_key + self.duration_key = duration_key def process_dataset_entry(self, data_entry) -> List: - duration = data_entry["duration"] + duration = data_entry[self.duration_key] if duration > self.high_duration_threshold: return [DataEntry(data=None, metrics=(0, 1))] elif duration < self.low_duration_threshold: From d3973c8b8dea2365f670c1d5b83c5ca596c7eeae Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 20 Oct 2023 10:21:33 -0700 Subject: [PATCH 014/115] nfa Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small_en.yaml | 66 ++++++++- .../commoncrawl/small_sentence.yaml | 10 +- .../datasets/commoncrawl/__init__.py | 3 +- .../datasets/commoncrawl/commoncrawl.py | 128 ++++++++++++++++++ 4 files changed, 200 insertions(+), 7 deletions(-) diff --git a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml index 1922dfe0..fb558487 100644 --- 
a/dataset_configs/commoncrawl/small_en.yaml +++ b/dataset_configs/commoncrawl/small_en.yaml @@ -1,9 +1,9 @@ -processors_to_run: "3:" +processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp/en processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + input_manifest_file: /mnt/ssd8/cc_sdp/manifest9a.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en @@ -144,4 +144,64 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess + input_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest19.json + input_manifest_arg: "manifest_filepath" + output_field: "alignment" + cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=stt_en_fastconformer_hybrid_large_pc \ + output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" + + - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner + output_manifest_file: ${workspace_dir}/manifest20.json + splited_audio_dir: ${workspace_dir}/nfa + input_field: source_audio + output_field: nfa_filepath + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest21.json + duplicate_fields: {"audio_filepath":"audio_filepath_base"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest22.json + rename_fields: {"nfa_filepath":"audio_filepath"} + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest23.json + high_duration_threshold: 40 + low_duration_threshold: 0.02 + duration_key: nfa_duration + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest24.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - 
_target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest25.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest26.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest27.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest28.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest29.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 96298ebc..4727c56d 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -51,7 +51,7 @@ processors: target_audio_field: "audio_filepath" duration_field: "duration" text_field: "text" - proxy_fields: [audio_lang, text_lang] + proxy_fields: [audio_lang, text_lang, audios] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration @@ -59,6 +59,10 @@ processors: high_duration_threshold: 40 low_duration_threshold: 0.02 - - _target_: sdp.processors.KeepOnlySpecifiedFields + - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest8a.json - fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang"] \ No newline at end of file + duplicate_fields: {"audios": "source_audio"} + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/manifest9a.json + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", 
"source_audio"] \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 7ae86a58..3fc1561f 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -14,4 +14,5 @@ from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ - ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF + ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ + GetOffsetDuration, SplitByAligner diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index d5992d73..cd261caa 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -1,4 +1,6 @@ import os +import re +import math import json import subprocess from tqdm import tqdm @@ -14,6 +16,70 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new from scipy.spatial import distance +class SplitByAligner(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + splited_audio_dir: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.splited_audio_dir = splited_audio_dir + + def prepare(self): + os.makedirs(self.splited_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.input_field] + + # print(data_entry) + data, samplerate = sf.read(audio_filepath) + nfa_start = data_entry["nfa_start"] + nfa_duration = data_entry["nfa_duration"] + + if math.isnan(nfa_start) or math.isnan(nfa_duration) or math.isnan(samplerate): + print(audio_filepath, nfa_start, nfa_duration) + data_entry[self.output_field] = data_entry['audio_filepath'] + else: + start = int(nfa_start*samplerate) + duration = int(nfa_duration*samplerate) + + data_sample = data[start : start+duration] + + wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(audio_filepath)[0].split('/')[-2:]), str(int(start*1000/samplerate))+"-"+str(int((start+duration)*1000/samplerate))+".wav") + if not os.path.isfile(wav_save_file): + os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) + sf.write(wav_save_file, data_sample, samplerate) + data_entry[self.output_field]=wav_save_file + return [DataEntry(data=data_entry)] + +class GetOffsetDuration(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + + def process_dataset_entry(self, data_entry): + input_value = data_entry[self.input_field] + offset, duration = os.path.splitext(os.path.split(input_value)[1])[0].split("-") + data_entry["offset"] = int(offset)/1000 + # data_entry["duration"] = duration + return [DataEntry(data=data_entry)] + class ASR_HF(BaseProcessor): """ Args: @@ -264,6 +330,68 @@ def process(self): df1[self.output_field] = tgtout write_jsonl(df1, self.output_manifest_file) +class AlignerSubprocess(Subprocess): + """This processor performs ASR inference on each utterance of the input manifest. + + ASR predictions will be saved in the ``pred_text`` key. + + Args: + pretrained_model (str): the name of the pretrained NeMo ASR model + which will be used to do inference. + batch_size (int): the batch size to use for ASR inference. Defaults to 32. + + Returns: + The same data as in the input manifest with an additional field + ``pred_text`` containing ASR model's predictions. 
+ """ + + def __init__( + self, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.output_field = output_field + + def process(self): + df1 = read_jsonl(self.input_manifest_file) + pattern = re.compile("\s{2,}") + df1["text"] = df1["text"].apply(lambda x: pattern.sub(" ", x).strip()) + df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) + + df2 = pd.DataFrame(df1.groupby("source_audio").apply(lambda in_df: "|".join(in_df["text"].tolist())), columns=["text"]).reset_index() + df2['audio_filepath'] = df2['source_audio'] + df2['text_len'] = df2['text'].apply(len) + df2 = df2[df2['text_len']<100000] + + self.input_manifest_file = os.path.join(os.path.split(self.input_manifest_file)[0], 'tmp.json') + write_jsonl(df2[['audio_filepath', 'text']], self.input_manifest_file) + + super().process() + manifest_path, manifest_name = os.path.split(self.input_manifest_file) + manifest_name = os.path.splitext(manifest_name)[0] + aligner_path = os.path.join(manifest_path,manifest_name+"_with_output_file_paths.json") + df3 = read_jsonl(aligner_path) + pattern = re.compile("") + df4 = pd.DataFrame() + + for ctm_filepath in tqdm(df3["segments_level_ctm_filepath"]): + source = os.path.splitext(ctm_filepath)[0].split('/')[-1] + df6 = df1[df1["source"] == source].reset_index() + df5 = pd.read_csv(ctm_filepath, sep=' ', header=None, dtype={0:str}) + df5["text"] = df5[4].apply(lambda x: pattern.sub(" ", x)) + df5["nfa_start"] = df5[2] + df5["nfa_duration"] = df5[3] + if df5.shape[0] == df6.shape[0]: + df7 = df5[["nfa_start", "nfa_duration", "text"]].merge(df6, how="right") + else: + raise ValueError(ctm_filepath) + + df4 = pd.concat([df4, df7]) + + write_jsonl(df4, self.output_manifest_file) + + class PreserveByValue(BaseParallelProcessor): """ Args: From 6170682e2ed29351b2580de5cdf0133e1c31b013 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 20 Oct 2023 10:47:18 -0700 Subject: [PATCH 015/115] source_audio Signed-off-by: Nikolay Karpov --- 
dataset_configs/commoncrawl/big_sentence.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index c7eda2af..a6e0d35e 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -15,7 +15,7 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt output_manifest_file: ${workspace_dir_s}/manifest1.json - vtt_files_dir: ${workspace_dir}/vtts/ + vtt_files_dir: ${workspace_dir_s}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" @@ -52,7 +52,7 @@ processors: duration_field: duration text_field: text vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang] + proxy_fields: [audio_lang, text_lang, audios] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration @@ -60,6 +60,10 @@ processors: high_duration_threshold: 40 low_duration_threshold: 0.02 + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir_s}/manifest8a.json + duplicate_fields: {"audios": "source_audio"} + - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir_s}/manifest8.json - fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang"] + output_manifest_file: ${workspace_dir_s}/manifest9a.json + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] From bf5ada03c4899a83c8a2404c2629f94f1dc80c00 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 23 Oct 2023 10:26:34 -0700 Subject: [PATCH 016/115] dsalign Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small_en.yaml | 41 +++++++++++- .../commoncrawl/small_sentence.yaml | 7 +- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 66 +++++++++++++++++++ 4 files changed, 113 insertions(+), 3 deletions(-) diff --git 
a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml index fb558487..289bff7b 100644 --- a/dataset_configs/commoncrawl/small_en.yaml +++ b/dataset_configs/commoncrawl/small_en.yaml @@ -204,4 +204,43 @@ processors: output_manifest_file: ${workspace_dir}/manifest29.json text_key: text pred_text_key: pred_text - cer_threshold: 30 \ No newline at end of file + cer_threshold: 30 + + + - _target_: sdp.processors.datasets.commoncrawl.JoinBy + input_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest30.json + input_field: source_audio + + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest31.json + input_manifest_arg: "--data_manifest" + output_manifest_arg: "--out_manifest" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NvLLMOps/nvllmops/stages/asr/data_segmentation/ds_align/ds_align.py \ + --splits_dir=/mnt/ssd8/cc_sdp/en/dsa \ + --stt-model-path=/home/nkarpov/ckpts/en/stt_en_conformer_ctc_large_1.1/stt_en_conformer_ctc_large.nemo \ + --stt-model-type=CTC \ + --min-audio-duration=2 \ + --max-audio-duration=40 \ + --asr-batch-size=32" + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest32.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest33.json + text_key: text + pred_text_key: text_asr_pred + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest34.json + text_key: text + pred_text_key: text_asr_pred + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 4727c56d..9a8d4223 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -65,4 +65,9 @@ processors: - 
_target_: sdp.processors.KeepOnlySpecifiedFields output_manifest_file: ${workspace_dir}/manifest9a.json - fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] \ No newline at end of file + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] + + - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth + output_manifest_file: ${workspace_dir}/manifest10a.json + input_field: audio_filepath + output_field: bandwidth \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 3fc1561f..7848306e 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -15,4 +15,4 @@ from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ - GetOffsetDuration, SplitByAligner + GetOffsetDuration, SplitByAligner, JoinBy, EvalBandwidth diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index cd261caa..34bf9519 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -3,8 +3,10 @@ import math import json import subprocess +import librosa from tqdm import tqdm import pandas as pd +import numpy as np from typing import Dict, List, Union from pathlib import Path from operator import lt, le, eq, ne, ge, gt @@ -16,6 +18,70 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new from scipy.spatial import distance +class JoinBy(BaseProcessor): + """This processor performs ASR inference on each utterance of 
the input manifest. + + """ + + def __init__( + self, + input_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + + def process(self): + df1 = read_jsonl(self.input_manifest_file) + pattern = re.compile("\s{2,}") + df1["text"] = df1["text"].apply(lambda x: pattern.sub(" ", x).strip()) + # df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) + + df2 = pd.DataFrame(df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df["text"].tolist())), columns=["text"]).reset_index() + df2['audio_filepath'] = df2[self.input_field] + write_jsonl(df2[['audio_filepath', 'text']], self.output_manifest_file) + +class EvalBandwidth(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + input_field (str): where to put to frequency bandwidth. + threshold (str): threshold to count frequency bandwidth. + """ + def __init__( + self, + input_field: str, + output_field: str, + threshold: int = -50, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.threshold = threshold + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.input_field] + data, samplerate = sf.read(audio_filepath) + freqband = self.eval_bandwidth(data, samplerate, threshold=self.threshold) + data_entry[self.output_field]=freqband + return [DataEntry(data=data_entry)] + + def eval_bandwidth(self, signal, sr, threshold=-50): + time_stride = 0.01 + hop_length = int(sr * time_stride) + n_fft = 512 + spectrogram = np.mean( + np.abs(librosa.stft(y=signal, n_fft=n_fft, hop_length=hop_length, window='blackmanharris')) ** 2, axis=1 + ) + power_spectrum = librosa.power_to_db(S=spectrogram, ref=np.max, top_db=100) + freqband = 0 + for idx in range(len(power_spectrum) - 1, -1, -1): + if power_spectrum[idx] > threshold: + freqband = idx / n_fft * sr + break + return freqband + class SplitByAligner(BaseParallelProcessor): """ Args: From 
075a08acc25899577e1d838cd594ee79a9db3435 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 26 Oct 2023 04:46:43 -0700 Subject: [PATCH 017/115] audio_duration Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_sentence.yaml | 4 +- .../commoncrawl/small_sentence.yaml | 4 +- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 139 +++++++++++++----- 4 files changed, 104 insertions(+), 45 deletions(-) diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index a6e0d35e..99de08ae 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -9,8 +9,8 @@ processors: resampled_audio_dir: ${workspace_dir}/audio/ target_samplerate: 16000 target_nchannels: 1 - video_field: "videos" - audio_field: "audios" + input_field: "videos" + output_field: "audios" key_field: "key" - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 9a8d4223..7fd5e5f3 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -8,8 +8,8 @@ processors: resampled_audio_dir: ${workspace_dir}/audio/ target_samplerate: 16000 target_nchannels: 1 - video_field: "videos" - audio_field: "audios" + input_field: "videos" + output_field: "audios" key_field: "key" - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 7848306e..22ed086d 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -15,4 +15,4 @@ from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, 
CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ - GetOffsetDuration, SplitByAligner, JoinBy, EvalBandwidth + SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 34bf9519..6798d074 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -15,11 +15,13 @@ from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger -from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new +from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance class JoinBy(BaseProcessor): - """This processor performs ASR inference on each utterance of the input manifest. + """ + This processor join several lines into one + input_field (str): where to get path to wav file. """ @@ -41,12 +43,33 @@ def process(self): df2['audio_filepath'] = df2[self.input_field] write_jsonl(df2[['audio_filepath', 'text']], self.output_manifest_file) +class AudioDuration(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.input_field] + data_entry[self.output_field]=audio_duration(audio_filepath) + return [DataEntry(data=data_entry)] + class EvalBandwidth(BaseParallelProcessor): """ Args: input_field (str): where to get path to wav file. - input_field (str): where to put to frequency bandwidth. - threshold (str): threshold to count frequency bandwidth. + output_field (str): where to put to frequency bandwidth. + threshold (str): power threshold (in dB relative to peak power in spectrum bin) to estimate frequency bandwidth. """ def __init__( self, @@ -84,8 +107,12 @@ def eval_bandwidth(self, signal, sr, threshold=-50): class SplitByAligner(BaseParallelProcessor): """ + split wav file using NFA aligner fields: nfa_start, nfa_duration + Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + input_field (str): field to get source wav file names. + output_field: (str): field to put splited wav file names. + splited_audio_dir (str): where to save splited wav files. """ def __init__( self, @@ -126,30 +153,14 @@ def process_dataset_entry(self, data_entry): data_entry[self.output_field]=wav_save_file return [DataEntry(data=data_entry)] -class GetOffsetDuration(BaseParallelProcessor): - """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
- """ - def __init__( - self, - input_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - - def process_dataset_entry(self, data_entry): - input_value = data_entry[self.input_field] - offset, duration = os.path.splitext(os.path.split(input_value)[1])[0].split("-") - data_entry["offset"] = int(offset)/1000 - # data_entry["duration"] = duration - return [DataEntry(data=data_entry)] - class ASR_HF(BaseProcessor): """ + Transcribe usinf ASR model from HuggingFace. Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (str): Inference batch size. """ def __init__( self, @@ -195,8 +206,16 @@ def process(self): class UseSonar(BaseProcessor): """ + Count vector distance using Sonar library. Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + input_text_field (str): field with text to process. + input_audio_field (str): field with audio file path to process. + output_field (str): field to save distance. + speech_encoder_model (str): name of pretrained speech encoder model. + text_encoder_lang (str): language of text. + text_encoder_model (str): name of pretrained text encoder model. + batch_size (int): batch size for inference. + device (str): device to inference on it. """ def __init__( self, @@ -279,8 +298,11 @@ def process_batch(self): class BLEUScore(BaseParallelProcessor): """ + Count BLEU Score Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ ref_field (str): field with reference texts + hyp_field (str): field with hypotheses + output_field (str): field to save BLEU Score """ def __init__( self, @@ -305,7 +327,7 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] class Subprocess(BaseProcessor): - """This processor performs ASR inference on each utterance of the input manifest. + """This processor performs subprocess. ASR predictions will be saved in the ``pred_text`` key. @@ -397,9 +419,9 @@ def process(self): write_jsonl(df1, self.output_manifest_file) class AlignerSubprocess(Subprocess): - """This processor performs ASR inference on each utterance of the input manifest. + """This processor performs alignment of text on each audio file in the input manifest. - ASR predictions will be saved in the ``pred_text`` key. + Predictions will be saved in the ``output_field`` key. Args: pretrained_model (str): the name of the pretrained NeMo ASR model @@ -414,10 +436,12 @@ class AlignerSubprocess(Subprocess): def __init__( self, output_field: str, + duration_threshold: int = 5000, **kwargs, ): super().__init__(**kwargs) self.output_field = output_field + self.duration_threshold = duration_threshold def process(self): df1 = read_jsonl(self.input_manifest_file) @@ -427,8 +451,8 @@ def process(self): df2 = pd.DataFrame(df1.groupby("source_audio").apply(lambda in_df: "|".join(in_df["text"].tolist())), columns=["text"]).reset_index() df2['audio_filepath'] = df2['source_audio'] - df2['text_len'] = df2['text'].apply(len) - df2 = df2[df2['text_len']<100000] + df2['duration'] = df2['audio_filepath'].apply(audio_duration) + df2 = df2[df2['duration'] < self.duration_threshold] self.input_manifest_file = os.path.join(os.path.split(self.input_manifest_file)[0], 'tmp.json') write_jsonl(df2[['audio_filepath', 'text']], self.input_manifest_file) @@ -918,8 +942,8 @@ def process_dataset_entry(self, data_entry): class FfmpegConvert(BaseParallelProcessor): """ Args: - video_field (str): field with 
path to video file in the input manifest - audio_field (str): field with path to audio file in the output manifest + input_field (str): field with path to video file in the input manifest + output_field (str): field with path to audio file in the output manifest key_field (str): field with key value resampled_audio_dir (str): where to put re-sampled and trimmed wav files. target_samplerate (int): sample rate to resample to. Defaults to 16000. @@ -928,16 +952,16 @@ class FfmpegConvert(BaseParallelProcessor): def __init__( self, resampled_audio_dir: str, - audio_field: str, - video_field: str, + input_field: str, + output_field: str, key_field: str, target_samplerate: int = 16000, target_nchannels: int = 1, **kwargs, ): super().__init__(**kwargs) - self.audio_field = audio_field - self.video_field = video_field + self.audio_field = input_field + self.video_field = output_field self.key_field = key_field self.resampled_audio_dir = resampled_audio_dir self.target_samplerate = target_samplerate @@ -954,4 +978,39 @@ def process_dataset_entry(self, data_entry): data_entry[self.audio_field]= audio data_entry[self.key_field] = key - return [DataEntry(data=data_entry)] \ No newline at end of file + return [DataEntry(data=data_entry)] + + +class CreateInitialManifestExt(BaseParallelProcessor): + """ + Args: + raw_data_dir (str): where to put raw downloaded data. + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. 
+ """ + def __init__( + self, + raw_data_dir: str, + output_field: str = "audio_filepath", + extention: str = "mp3", + **kwargs, + ): + super().__init__(**kwargs) + self.raw_data_dir = Path(raw_data_dir) + self.output_field = output_field + self.extention = extention + + def prepare(self): + os.makedirs(self.raw_data_dir, exist_ok=True) + + def read_manifest(self): + input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] + v_df = pd.DataFrame({self.output_field: input_files}) + return v_df.values + + def process_dataset_entry(self, data_entry): + (inputf) = data_entry + + data = {self.output_field: inputf[0]} + return [DataEntry(data=data)] \ No newline at end of file From e06560817915616673f64dd2dceacf307afe68a7 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 26 Oct 2023 05:03:01 -0700 Subject: [PATCH 018/115] EvalBandwidth and AlignerSubprocess Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_es.yaml | 66 ++++++++++++++++++- dataset_configs/commoncrawl/big_sentence.yaml | 6 ++ 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index 9786fff9..e0035151 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -1,9 +1,9 @@ -processors_to_run: "4:" +processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/es processors: - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest9a.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: es @@ -154,4 +154,64 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess + input_manifest_file: 
${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest19.json + input_manifest_arg: "manifest_filepath" + output_field: "alignment" + cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=nvidia/stt_es_fastconformer_hybrid_large_pc \ + output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" + + - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner + output_manifest_file: ${workspace_dir}/manifest20.json + splited_audio_dir: ${workspace_dir}/nfa + input_field: source_audio + output_field: nfa_filepath + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest21.json + duplicate_fields: {"audio_filepath":"audio_filepath_base"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest22.json + rename_fields: {"nfa_filepath":"audio_filepath"} + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest23.json + high_duration_threshold: 40 + low_duration_threshold: 0.02 + duration_key: nfa_duration + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest24.json + pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest25.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest26.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest27.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest28.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 
+ + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest29.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 99de08ae..ea541641 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -67,3 +67,9 @@ processors: - _target_: sdp.processors.KeepOnlySpecifiedFields output_manifest_file: ${workspace_dir_s}/manifest9a.json fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] + + - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth + input_manifest_file: ${workspace_dir_s}/manifest5.json + output_manifest_file: ${workspace_dir_s}/manifest5a.json + input_field: audios + output_field: bandwidth \ No newline at end of file From dd9f2600a369a0213a1940d044e7693dc4bdfc4d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 2 Nov 2023 01:20:31 -0700 Subject: [PATCH 019/115] split CreateInitialManifestCC Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 98 ++++++++++++------- dataset_configs/commoncrawl/big_en.yaml | 9 +- dataset_configs/commoncrawl/small.yaml | 2 +- .../datasets/commoncrawl/commoncrawl.py | 14 ++- 4 files changed, 77 insertions(+), 46 deletions(-) diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml index ba34839d..50fe70b0 100644 --- a/dataset_configs/commoncrawl/big.yaml +++ b/dataset_configs/commoncrawl/big.yaml @@ -1,75 +1,101 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/common_crawl/cc_sdp +workspace_dir: /mnt/md1/out +workspace_dir_s: /mnt/md0/out processors: - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + raw_data_dir: /mnt/md1/out/output_valid_captions output_manifest_file: 
${workspace_dir}/manifest0.json - resampled_audio_dir: ${workspace_dir}/audio/ - target_samplerate: 16000 - target_nchannels: 1 - audio_field: "audios" video_field: "videos" key_field: "key" text_field: "texts" - - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - input_manifest_file: ${workspace_dir}/manifest0.json + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/md1/out/output_valid_captions output_manifest_file: ${workspace_dir}/manifest1.json + output_video_field: video_url + output_caption_field: caption_url + key_field: key + + - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + output_manifest_file: ${workspace_dir}/manifest2.json + resampled_audio_dir: ${workspace_dir_s}/audio + target_samplerate: 16000 + target_nchannels: 1 + input_field: "videos" + output_field: "audios" + key_field: "key" + + - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + output_manifest_file: ${workspace_dir}/manifest3.json + input_field: audios + output_field: duration + + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest4.json + input_field: duration + target_value: 0 + operator: gt + + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt + output_manifest_file: ${workspace_dir}/manifest5.json vtt_files_dir: ${workspace_dir}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText - input_manifest_file: ${workspace_dir}/manifest1.json - output_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest6.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest7.json input_text_field: vtt_text output_lang_field: 
text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - input_manifest_file: ${workspace_dir}/manifest3.json - output_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest8.json input_lang_field: text_lang output_lang_field: text_lang - _target_: sdp.processors.datasets.commoncrawl.AudioLid - input_manifest_file: ${workspace_dir}/manifest4.json - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest9.json input_audio_field: audios output_lang_field: audio_lang device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.commoncrawl.SplitByVtt - input_manifest_file: ${workspace_dir}/manifest5.json - output_manifest_file: ${workspace_dir}/manifest6.json - splited_audio_dir: ${workspace_dir}/splited/ + - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence + output_manifest_file: ${workspace_dir}/manifest10.json + splited_audio_dir: ${workspace_dir}/splited source_audio_field: audios - audio_lang_field: audio_lang - text_lang_field: text_lang - key_field: "key" - target_audio_field: "audio_filepath" - duration_field: "duration" - text_field: "text" - vtt_field: "vtt_filepath" + target_audio_field: audio_filepath + duration_field: duration + text_field: text + vtt_field: vtt_filepath + proxy_fields: [audio_lang, text_lang, audios] + duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest7.json - high_duration_threshold: 40 - low_duration_threshold: 0.2 + output_manifest_file: ${workspace_dir}/manifest11.json + high_duration_threshold: 60 + low_duration_threshold: 0.01 - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/md0/common_crawl/output/video_output2 - output_manifest_file: ${workspace_dir}/manifest8.json - output_video_field: video - 
output_vtt_field: caption - key_field: key + - _target_: sdp.processors.RenameFields + input_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest12.json + rename_fields: {"audios":"audio_filepath"} + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest13.json + input_manifest_arg: "diarizer.manifest_filepath" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ + diarizer.out_dir=${workspace_dir}/diar \ + diarizer.speaker_embeddings.parameters.save_embeddings=False \ + diarizer.vad.model_path=/home/nkarpov/ckpts/diar/vad_multilingual_marblenet.nemo \ + diarizer.speaker_embeddings.model_path=/home/nkarpov/ckpts/diar/titanet-l.nemo" \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 8e9e31d0..1b4b7b03 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/en +workspace_dir: /mnt/md1/out/en #/mnt/md0/common_crawl/cc_sdp/en processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en @@ -24,8 +24,8 @@ processors: # - '://' # - "(\\s)+(www)\\.[a-zA-Z0-9/]+(\\s|$)+" # - '\\x' - - "www\\.wiki" - - "www\\.usgs\\." 
+ - "www\\.wiki\\s" + - "www\\.usgs\\.\\s" # - 'é' # - 'ô' # - '×' @@ -69,6 +69,7 @@ processors: output_manifest_file: ${workspace_dir}/manifest5.json regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} diff --git a/dataset_configs/commoncrawl/small.yaml b/dataset_configs/commoncrawl/small.yaml index d7a61254..c326188f 100644 --- a/dataset_configs/commoncrawl/small.yaml +++ b/dataset_configs/commoncrawl/small.yaml @@ -77,5 +77,5 @@ processors: raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 output_manifest_file: ${workspace_dir}/manifest8.json output_video_field: video - output_vtt_field: caption + output_caption_field: caption key_field: key \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 6798d074..974363a5 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -61,7 +61,11 @@ def __init__( def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.input_field] - data_entry[self.output_field]=audio_duration(audio_filepath) + try: + data_entry[self.output_field]=audio_duration(audio_filepath) + except Exception as e: + logger.warning(str(e) + " file: " + audio_filepath) + data_entry[self.output_field] = -1.0 return [DataEntry(data=data_entry)] class EvalBandwidth(BaseParallelProcessor): @@ -843,14 +847,14 @@ class ReadParquet(BaseParallelProcessor): def __init__( self, output_video_field: str, - output_vtt_field: str, + output_caption_field: str, key_field: str, raw_data_dir: str, **kwargs, ): super().__init__(**kwargs) self.output_video_field = output_video_field - self.output_vtt_field = output_vtt_field + self.output_caption_field = output_caption_field 
self.key_field = key_field self.raw_data_dir = Path(raw_data_dir) @@ -872,10 +876,10 @@ def process_dataset_entry(self, data_entry): key = key.split("/")[1] try: data_entry[self.output_video_field] = self.urls.loc[key]['url'] - data_entry[self.output_vtt_field] = self.urls.loc[key]['caption'] + data_entry[self.output_caption_field] = self.urls.loc[key]['caption'] except: data_entry[self.output_video_field] = "NN" - data_entry[self.output_vtt_field] = "NN" + data_entry[self.output_caption_field] = "NN" logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] From 1282ffbf4c878d0cfa0750977ee8bd142de78dcf Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 2 Nov 2023 01:21:16 -0700 Subject: [PATCH 020/115] split CreateInitialManifestCC Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_sentence.yaml | 66 ++++++++++++------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index ea541641..59a1dc4f 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -3,73 +3,93 @@ workspace_dir: /mnt/md1/common_crawl/cc_sdp workspace_dir_s: /mnt/md0/common_crawl/cc_sdp processors: - - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - input_manifest_file: ${workspace_dir_s}/manifest_urls.json + - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir_s}/manifest0.json - resampled_audio_dir: ${workspace_dir}/audio/ + video_field: "source_video" + text_field: "texts" + key_field: "key" + + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir_s}/manifest1.json + output_video_field: video_url + output_caption_field: caption_url + key_field: key + + 
- _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + output_manifest_file: ${workspace_dir_s}/manifest2.json #${workspace_dir_s}/manifest_urls.json + resampled_audio_dir: ${workspace_dir_s}/audio target_samplerate: 16000 target_nchannels: 1 - input_field: "videos" - output_field: "audios" + input_field: "source_video" + output_field: "source_audio" key_field: "key" + - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + output_manifest_file: ${workspace_dir_s}/manifest3.json + input_field: source_audio + output_field: duration + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir_s}/manifest4.json + input_field: duration + target_value: 0 + operator: gt + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - output_manifest_file: ${workspace_dir_s}/manifest1.json + output_manifest_file: ${workspace_dir_s}/manifest5.json vtt_files_dir: ${workspace_dir_s}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir_s}/manifest2.json + output_manifest_file: ${workspace_dir_s}/manifest6.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir_s}/manifest3.json + output_manifest_file: ${workspace_dir_s}/manifest7.json input_text_field: vtt_text output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir_s}/manifest4.json + output_manifest_file: ${workspace_dir_s}/manifest8.json input_lang_field: text_lang output_lang_field: text_lang - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir_s}/manifest5.json - input_audio_field: audios + output_manifest_file: ${workspace_dir_s}/manifest9.json + 
input_audio_field: source_audio output_lang_field: audio_lang device: cuda pretrained_model: "langid_ambernet" - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir_s}/manifest6.json + output_manifest_file: ${workspace_dir_s}/manifest10.json splited_audio_dir: ${workspace_dir_s}/splited/ - source_audio_field: audios + source_audio_field: source_audio target_audio_field: audio_filepath duration_field: duration text_field: text vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang, audios] + proxy_fields: [audio_lang, text_lang, source_audio] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir_s}/manifest7.json - high_duration_threshold: 40 - low_duration_threshold: 0.02 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir_s}/manifest8a.json - duplicate_fields: {"audios": "source_audio"} + output_manifest_file: ${workspace_dir_s}/manifest11.json + high_duration_threshold: 60 + low_duration_threshold: 0.01 - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir_s}/manifest9a.json + output_manifest_file: ${workspace_dir_s}/manifest12.json fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth input_manifest_file: ${workspace_dir_s}/manifest5.json output_manifest_file: ${workspace_dir_s}/manifest5a.json - input_field: audios + input_field: source_audio output_field: bandwidth \ No newline at end of file From c1396adcbc729b71a00efa4528b64c6d032d5d81 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 2 Nov 2023 10:16:45 -0700 Subject: [PATCH 021/115] key_field Signed-off-by: Nikolay Karpov --- .../commoncrawl/small_sentence.yaml | 31 ++++++++++++++++--- .../datasets/commoncrawl/commoncrawl.py | 19 +++++++----- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git 
a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 7fd5e5f3..119bf3e7 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -2,16 +2,37 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp processors: + - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + video_field: "source_video" + text_field: "texts" + key_field: "key" + + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_video_field: video_url + output_caption_field: caption_url + key_field: key + - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - input_manifest_file: ${workspace_dir}/manifest_urls.json - output_manifest_file: ${workspace_dir}/manifest0.json - resampled_audio_dir: ${workspace_dir}/audio/ + # input_manifest_file:${workspace_dir}/manifest_urls.json + resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 - input_field: "videos" - output_field: "audios" + input_field: "source_video" + output_field: "source_audio" key_field: "key" + - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + input_field: source_audio + output_field: duration + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: duration + target_value: 0 + operator: gt + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt output_manifest_file: ${workspace_dir}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 974363a5..8cf39c79 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ 
b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -18,6 +18,7 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance + class JoinBy(BaseProcessor): """ This processor join several lines into one @@ -883,6 +884,10 @@ def process_dataset_entry(self, data_entry): logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] +def get_key(x): + key = "/".join(os.path.splitext(x)[0].split("/")[-2:]) + return key + class CreateInitialManifestCC(BaseParallelProcessor): """ Args: @@ -922,8 +927,8 @@ def read_manifest(self): texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] v_df = pd.DataFrame({self.video_field: videos}) t_df = pd.DataFrame({self.text_field: texts }) - v_df[self.key_field] = v_df[self.video_field].apply(lambda x: os.path.splitext(x)[0][-13:]) - t_df[self.key_field] = t_df[self.text_field].apply(lambda x: os.path.splitext(x)[0][-13:]) + v_df[self.key_field] = v_df[self.video_field].apply(get_key) + t_df[self.key_field] = t_df[self.text_field].apply(get_key) v_df = v_df.drop_duplicates(self.key_field) t_df = t_df.drop_duplicates(self.key_field) vt_df = v_df.merge(t_df, on=self.key_field, how="left") @@ -964,23 +969,23 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.audio_field = input_field - self.video_field = output_field + self.input_field = input_field + self.output_field = output_field self.key_field = key_field self.resampled_audio_dir = resampled_audio_dir self.target_samplerate = target_samplerate self.target_nchannels = target_nchannels def process_dataset_entry(self, data_entry): - video = data_entry[self.video_field] - key = os.path.splitext(data_entry[self.video_field])[0][-13:] + video = data_entry[self.input_field] + key = data_entry[self.key_field] 
os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) audio = os.path.join(self.resampled_audio_dir, key) + ".wav" if not os.path.isfile(audio): ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) - data_entry[self.audio_field]= audio + data_entry[self.output_field]= audio data_entry[self.key_field] = key return [DataEntry(data=data_entry)] From fbee3801e9989aba1d4be718656ba96d7203cf11 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 7 Nov 2023 23:25:22 -0800 Subject: [PATCH 022/115] offline_diar_infer Signed-off-by: Nikolay Karpov --- .../commoncrawl/small_sentence.yaml | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 119bf3e7..a2429e86 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -1,5 +1,6 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp +workspace_dir_diar: /mnt/ssd8/cc_sdp/diarize processors: - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC @@ -91,4 +92,23 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth output_manifest_file: ${workspace_dir}/manifest10a.json input_field: audio_filepath - output_field: bandwidth \ No newline at end of file + output_field: bandwidth + + - _target_: sdp.processors.RenameFields + input_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir_diar}/manifest0.json + rename_fields: {"source_audio":"audio_filepath"} + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + input_manifest_arg: "diarizer.manifest_filepath" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ + --config-path=/home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/conf/inference/ 
--config-name=diar_infer_general.yaml \ + diarizer.out_dir=${workspace_dir_diar} \ + diarizer.speaker_embeddings.parameters.save_embeddings=False \ + diarizer.vad.model_path=/home/nkarpov/ckpts/diar/vad_multilingual_marblenet.nemo \ + diarizer.speaker_embeddings.model_path=/home/nkarpov/ckpts/diar/titanet-l.nemo \ + diarizer.clustering.parameters.max_num_speakers=4 \ + diarizer.clustering.parameters.enhanced_count_thres=80 \ + diarizer.vad.parameters.onset=0.1 \ + diarizer.vad.parameters.offset=0.1 " \ No newline at end of file From 7fc4c1e9820947691d2e92fde040657ee38205d1 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 8 Nov 2023 02:29:29 -0800 Subject: [PATCH 023/115] arm Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/config.yaml | 59 +++++++ sdp/processors/datasets/arm/__init__.py | 15 ++ sdp/processors/datasets/arm/armenian.py | 200 ++++++++++++++++++++++++ 3 files changed, 274 insertions(+) create mode 100644 dataset_configs/armenian/config.yaml create mode 100644 sdp/processors/datasets/arm/__init__.py create mode 100644 sdp/processors/datasets/arm/armenian.py diff --git a/dataset_configs/armenian/config.yaml b/dataset_configs/armenian/config.yaml new file mode 100644 index 00000000..43487b8d --- /dev/null +++ b/dataset_configs/armenian/config.yaml @@ -0,0 +1,59 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/arm + +processors: + - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt + raw_data_dir: /mnt/ssd8/arm/mp3 + extention: mp3 + output_field: source_filepath + output_manifest_file: ${workspace_dir}/manifest0.json + + - _target_: sdp.processors.datasets.arm.FfmpegConvert + output_manifest_file: ${workspace_dir}/manifest1.json + resampled_audio_dir: ${workspace_dir}/audio + target_samplerate: 16000 + target_nchannels: 1 + input_field: "source_filepath" + output_field: "audio_filepath" + key_field: null + + - _target_: sdp.processors.datasets.arm.AudioDuration + input_field: audio_filepath + output_field: duration + 
output_manifest_file: ${workspace_dir}/manifest2.json + + - _target_: sdp.processors.datasets.arm.ASR_Whisper + output_manifest_file: ${workspace_dir}/manifest3.json + pretrained_model: "large-v2" + output_text_field: text + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: "text" + + - _target_: sdp.processors.DropNonAlphabet + output_manifest_file: ${workspace_dir}/manifest5.json + alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև.,!?" + test_cases: + - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} + - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest6.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": 'a', "repl": "ա"} + + - {"pattern": 'անտար', "repl": "անտառ"} + - {"pattern": 'թնակ', "repl": "տնակ"} + - {"pattern": 'Ռուսերենիս', "repl": "Ռուսերենից"} + - {"pattern": 'ամալիահ', "repl": "Ամալիյա"} + + - {"pattern": 'Էտկարպո', "repl": "Էդգար Պո"} + - {"pattern": 'թարգմանություն', "repl": "թարգմանությունը"} + - {"pattern": 'արտաշ է սեմինի', "repl": "Արտաշես Էմինի"} + # double space to single space + - {"pattern": " ", "repl": " "} + test_cases: + - {input: {text: "Գրիմ եղբայրներ, անտարի թնակը, Ռուսերենիս թարգմանեց, ամալիահ Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} + - {input: {text: "Էտկարպո, Մատնիչ սիրտը, թարգմանություն արտաշ է սեմինի."}, output: {text: "Էդգար Պո, Մատնիչ սիրտը, թարգմանությունը Արտաշես Էմինի."}} diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/datasets/arm/__init__.py new file mode 100644 index 00000000..9f1dd5cc --- /dev/null +++ 
b/sdp/processors/datasets/arm/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py new file mode 100644 index 00000000..eb536eb9 --- /dev/null +++ b/sdp/processors/datasets/arm/armenian.py @@ -0,0 +1,200 @@ +from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry +from sdp.logging import logger +import numpy as np +import os +import pandas as pd +from tqdm import tqdm +import json +from pathlib import Path +import soundfile as sf +import subprocess +from typing import Dict, List, Union + + +def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: + result = [] + with manifest.open() as f: + for i, line in enumerate(f): + data = json.loads(line) + result.append(data) + return result + +class CreateInitialManifestByExt(BaseParallelProcessor): + """ + Args: + raw_data_dir (str): where to put raw downloaded data. + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. 
+ """ + def __init__( + self, + raw_data_dir: str, + output_field: str = "audio_filepath", + extention: str = "mp3", + **kwargs, + ): + super().__init__(**kwargs) + self.raw_data_dir = Path(raw_data_dir) + self.output_field = output_field + self.extention = extention + + def read_manifest(self): + input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] + v_df = pd.DataFrame({self.output_field: input_files}) + return v_df.values + + def process_dataset_entry(self, data_entry): + (inputf) = data_entry + + data = {self.output_field: inputf[0]} + return [DataEntry(data=data)] + + +def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): + process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] + if ar: + process_args = process_args[:-1] + process_args.extend(["-ar", str(ar), wav]) + return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) + +class FfmpegConvert(BaseParallelProcessor): + """ + Args: + input_field (str): field with path to video file in the input manifest + output_field (str): field with path to audio file in the output manifest + key_field (str): field with key value + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. 
+ """ + def __init__( + self, + resampled_audio_dir: str, + input_field: str, + output_field: str, + key_field: str = None, + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.key_field = key_field + self.resampled_audio_dir = resampled_audio_dir + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def prepare(self): + os.makedirs(os.path.split(self.output_manifest_file)[0], exist_ok=True) + os.makedirs(self.resampled_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + video = data_entry[self.input_field] + if self.key_field: + key = data_entry[self.key_field] + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + else: + key = os.path.splitext(video)[0].split("/")[-1] + audio = os.path.join(self.resampled_audio_dir, key) + ".wav" + + if not os.path.isfile(audio): + ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + + data_entry[self.output_field]= audio + if self.key_field: + data_entry[self.key_field] = key + return [DataEntry(data=data_entry)] + + +class AudioDuration(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.input_field] + try: + data, samplerate = sf.read(audio_filepath) + data_entry[self.output_field]=data.shape[0]/samplerate + except Exception as e: + logger.warning(str(e) + " file: " + audio_filepath) + data_entry[self.output_field] = -1.0 + return [DataEntry(data=data_entry)] + + +class ASR_Whisper(BaseProcessor): + """ + Transcribe usinf ASR model from HuggingFace. + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (str): Inference batch size. + """ + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + + def process(self): + import torch + import whisper # pip install -U openai-whisper + self.whisper = whisper + + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + self.model = self.whisper.load_model(self.pretrained_model) + + manifest = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(manifest): + text_hyp, lang = self.whisper_infer(item["audio_filepath"]) + # print(f"Detected language: {lang}") + item[self.output_text_field] = text_hyp + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + def whisper_infer(self, audio_path): + audio = self.whisper.load_audio(audio_path) + + audio = 
self.whisper.pad_or_trim(audio) + mel = self.whisper.log_mel_spectrogram(audio) + mel = mel.to(self.device) + + _, probs = self.model.detect_language(mel) + lang = max(probs, key=probs.get) + + options = self.whisper.DecodingOptions() + result = self.whisper.decode(self.model, mel, options) + return result.text, lang + + + \ No newline at end of file From 7661d8ead83dfc0e01666f98570cd61f8d61d293 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 8 Nov 2023 22:44:12 -0800 Subject: [PATCH 024/115] duplicates Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 5 +- dataset_configs/commoncrawl/big_de.yaml | 16 +-- dataset_configs/commoncrawl/big_en.yaml | 103 ++++++++++++++++++ dataset_configs/commoncrawl/big_es.yaml | 1 + dataset_configs/commoncrawl/big_fr.yaml | 9 +- dataset_configs/commoncrawl/big_pl.yaml | 5 +- dataset_configs/commoncrawl/big_sentence.yaml | 4 +- .../commoncrawl/small_sentence.yaml | 12 +- .../datasets/commoncrawl/commoncrawl.py | 38 +++---- 9 files changed, 141 insertions(+), 52 deletions(-) diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml index 50fe70b0..12e9e9f2 100644 --- a/dataset_configs/commoncrawl/big.yaml +++ b/dataset_configs/commoncrawl/big.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/out +workspace_dir: /mnt/md1/out # /mnt/md1/common_crawl/cc_sdp workspace_dir_s: /mnt/md0/out processors: @@ -18,7 +18,7 @@ processors: key_field: key - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - output_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest2.json #${workspace_dir_s}/manifest_urls.json resampled_audio_dir: ${workspace_dir_s}/audio target_samplerate: 16000 target_nchannels: 1 @@ -30,7 +30,6 @@ processors: output_manifest_file: ${workspace_dir}/manifest3.json input_field: audios output_field: duration - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: 
${workspace_dir}/manifest4.json diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 711d0849..d1643b23 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" # ü ä ö ß Ä Ö Ü -workspace_dir: /mnt/md0/common_crawl/cc_sdp/de +workspace_dir: /mnt/md1/out/de # /mnt/md0/common_crawl/cc_sdp/de processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: de @@ -27,6 +27,7 @@ processors: text_key: text regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} @@ -73,17 +74,6 @@ processors: --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" # --overwrite_cache - - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # input_manifest_file: ${workspace_dir}/manifest6.json - # output_manifest_file: ${workspace_dir}/manifest7.json - # input_manifest_arg: "--input_file" - # output_manifest_arg: "--output_file" - # arg_separator: "=" - # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - # --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - - _target_: sdp.processors.RenameFields output_manifest_file: 
${workspace_dir}/manifest8.json diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 1b4b7b03..c0dfd514 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -192,4 +192,107 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 + + - _target_: sdp.processors.RenameFields + input_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest21.json + rename_fields: {"audios":"source_audio"} + + - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess + output_manifest_file: ${workspace_dir}/manifest22.json + input_manifest_arg: "manifest_filepath" + output_field: "alignment" + cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=stt_en_fastconformer_hybrid_large_pc \ + output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" + + - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner + output_manifest_file: ${workspace_dir}/manifest23.json + splited_audio_dir: ${workspace_dir}/nfa + input_field: source_audio + output_field: nfa_filepath + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest24.json + duplicate_fields: {"audio_filepath":"audio_filepath_base"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest25.json + rename_fields: {"nfa_filepath":"audio_filepath"} + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest26.json + high_duration_threshold: 60 + low_duration_threshold: 0.01 + duration_key: nfa_duration + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest27.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest28.json + 
duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest29.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest30.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest31.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest32.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + + + - _target_: sdp.processors.datasets.commoncrawl.JoinBy + input_manifest_file: ${workspace_dir}/manifest21.json + output_manifest_file: ${workspace_dir}/manifest33.json + input_field: source_audio + + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest34.json + input_manifest_arg: "--data_manifest" + output_manifest_arg: "--out_manifest" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NvLLMOps/nvllmops/stages/asr/data_segmentation/ds_align/ds_align.py \ + --splits_dir=/mnt/ssd8/cc_sdp/en/dsa \ + --stt-model-path=/home/nkarpov/ckpts/en/stt_en_conformer_ctc_large_1.1/stt_en_conformer_ctc_large.nemo \ + --stt-model-type=CTC \ + --min-audio-duration=2 \ + --max-audio-duration=40 \ + --asr-batch-size=32" + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest35.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest36.json + text_key: text + pred_text_key: text_asr_pred + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest37.json + text_key: text + pred_text_key: 
text_asr_pred + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index e0035151..a588b180 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -27,6 +27,7 @@ processors: text_key: text regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 80a12856..5cf70f42 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr +workspace_dir: /mnt/md1/out/fr #/mnt/md0/common_crawl/cc_sdp/fr processors: - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: fr @@ -19,8 +19,8 @@ processors: - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest3.json - pretrained_model: nvidia/stt_fr_conformer_transducer_large #stt_fr_fastconformer_hybrid_large_pc - batch_size: 64 + pretrained_model: nvidia/stt_fr_fastconformer_hybrid_large_pc + batch_size: 32 - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest4.json @@ -32,6 +32,7 @@ processors: text_key: text regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": "\\\\x[a-f\\d]{1,}", "repl": " "} - {"pattern": '‚', "repl": ","} diff --git a/dataset_configs/commoncrawl/big_pl.yaml 
b/dataset_configs/commoncrawl/big_pl.yaml index a7e3a41b..29a590cb 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/pl +workspace_dir: /mnt/md1/out/pl #/mnt/md0/common_crawl/cc_sdp/pl processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: pl @@ -27,6 +27,7 @@ processors: text_key: text regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": '‚', "repl": ","} - {"pattern": "’", "repl": "'"} diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 59a1dc4f..a31bcaea 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -19,7 +19,7 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert output_manifest_file: ${workspace_dir_s}/manifest2.json #${workspace_dir_s}/manifest_urls.json - resampled_audio_dir: ${workspace_dir_s}/audio + resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 input_field: "source_video" @@ -39,7 +39,7 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt output_manifest_file: ${workspace_dir_s}/manifest5.json - vtt_files_dir: ${workspace_dir_s}/vtts/ + vtt_files_dir: ${workspace_dir_s}/vtts key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index a2429e86..70414221 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ 
b/dataset_configs/commoncrawl/small_sentence.yaml @@ -4,12 +4,14 @@ workspace_dir_diar: /mnt/ssd8/cc_sdp/diarize processors: - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + output_manifest_file: ${workspace_dir}/manifest0s.json raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 video_field: "source_video" text_field: "texts" key_field: "key" - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + output_manifest_file: ${workspace_dir}/manifest1s.json raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 output_video_field: video_url output_caption_field: caption_url @@ -17,6 +19,7 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert # input_manifest_file:${workspace_dir}/manifest_urls.json + output_manifest_file: ${workspace_dir}/manifest2s.json resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 @@ -25,29 +28,30 @@ processors: key_field: "key" - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + output_manifest_file: ${workspace_dir}/manifest3s.json input_field: source_audio output_field: duration - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest4s.json input_field: duration target_value: 0 operator: gt - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - output_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest5s.json vtt_files_dir: ${workspace_dir}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest6s.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - 
output_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest7s.json input_text_field: vtt_text output_lang_field: text_lang device: cuda diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 8cf39c79..239a66f7 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -760,18 +760,20 @@ def process(self): manifest = load_manifest(Path(self.input_manifest_file)) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - + text_set = set() with Path(self.output_manifest_file).open('w') as f: for item in tqdm(manifest): text = item[self.input_text_field] - if text: - lid = text2lid(text_model, tokenizer, text) - else: - lid = None - - if lid: - item[self.output_lang_field] = lid - f.write(json.dumps(item, ensure_ascii=False) + '\n') + if text not in text_set: + text_set.add(text) + if text: + lid = text2lid(text_model, tokenizer, text) + else: + lid = None + + if lid: + item[self.output_lang_field] = lid + f.write(json.dumps(item, ensure_ascii=False) + '\n') class AllVttText(BaseParallelProcessor): """ @@ -899,34 +901,27 @@ class CreateInitialManifestCC(BaseParallelProcessor): def __init__( self, raw_data_dir: str, - resampled_audio_dir: str, - audio_field: str, video_field: str, key_field: str, text_field: str, - target_samplerate: int = 16000, - target_nchannels: int = 1, **kwargs, ): super().__init__(**kwargs) self.raw_data_dir = Path(raw_data_dir) - self.audio_field = audio_field self.video_field = video_field self.key_field = key_field self.text_field = text_field - self.resampled_audio_dir = resampled_audio_dir - self.target_samplerate = target_samplerate - self.target_nchannels = target_nchannels def prepare(self): os.makedirs(self.raw_data_dir, exist_ok=True) - os.makedirs(self.resampled_audio_dir, exist_ok=True) + def read_manifest(self): videos = 
[str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.jpg')] texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] v_df = pd.DataFrame({self.video_field: videos}) t_df = pd.DataFrame({self.text_field: texts }) + v_df[self.key_field] = v_df[self.video_field].apply(get_key) t_df[self.key_field] = t_df[self.text_field].apply(get_key) v_df = v_df.drop_duplicates(self.key_field) @@ -936,13 +931,8 @@ def read_manifest(self): def process_dataset_entry(self, data_entry): (video, key, text) = data_entry - os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) - audio = os.path.join(self.resampled_audio_dir, key) + ".wav" - if not os.path.isfile(audio): - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) - data = {self.audio_field: audio, - self.video_field: video, + data = {self.video_field: video, self.key_field: key, self.text_field: text} return [DataEntry(data=data)] From 718a8122e6aca482a069df0a5178dd752c431d25 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 8 Nov 2023 22:53:56 -0800 Subject: [PATCH 025/115] drop_text_duplicates Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 1 + dataset_configs/commoncrawl/big_sentence.yaml | 1 + dataset_configs/commoncrawl/small.yaml | 1 + dataset_configs/commoncrawl/small_sentence.yaml | 1 + sdp/processors/datasets/commoncrawl/commoncrawl.py | 4 +++- 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml index 12e9e9f2..44199a43 100644 --- a/dataset_configs/commoncrawl/big.yaml +++ b/dataset_configs/commoncrawl/big.yaml @@ -55,6 +55,7 @@ processors: output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir}/manifest8.json diff --git 
a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index a31bcaea..a930f770 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -55,6 +55,7 @@ processors: output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir_s}/manifest8.json diff --git a/dataset_configs/commoncrawl/small.yaml b/dataset_configs/commoncrawl/small.yaml index c326188f..be90de1b 100644 --- a/dataset_configs/commoncrawl/small.yaml +++ b/dataset_configs/commoncrawl/small.yaml @@ -36,6 +36,7 @@ processors: output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso input_manifest_file: ${workspace_dir}/manifest3.json diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 70414221..2e311dd3 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -56,6 +56,7 @@ processors: output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir}/manifest4.json diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 239a66f7..f587d920 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -734,6 +734,7 @@ def __init__( pretrained_model: str, output_lang_field: str, device: str, + drop_text_duplicates: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -741,6 
+742,7 @@ def __init__( self.pretrained_model = pretrained_model self.output_lang_field = output_lang_field self.device = device + self.drop_duplicates = drop_text_duplicates def process(self): import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo @@ -764,7 +766,7 @@ def process(self): with Path(self.output_manifest_file).open('w') as f: for item in tqdm(manifest): text = item[self.input_text_field] - if text not in text_set: + if self.drop_duplicates and text not in text_set: text_set.add(text) if text: lid = text2lid(text_model, tokenizer, text) From 8bfdfc94475865154aa3438cbb7765735e89bb93 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 9 Nov 2023 06:57:05 -0800 Subject: [PATCH 026/115] mcv Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/mcv.yaml | 27 ++++++++++++++ sdp/processors/datasets/arm/armenian.py | 47 +++++++++++-------------- 2 files changed, 48 insertions(+), 26 deletions(-) create mode 100644 dataset_configs/armenian/mcv.yaml diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml new file mode 100644 index 00000000..c865f91b --- /dev/null +++ b/dataset_configs/armenian/mcv.yaml @@ -0,0 +1,27 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/arm/mcv + +processors: + - _target_: sdp.processors.CreateInitialManifestMCV + raw_data_dir: /home/nkarpov/data/hy + extract_archive_dir: /mnt/ssd8/arm/mcv/row + resampled_audio_dir: /mnt/ssd8/arm/mcv/16k + data_split: train + language_id: cv-corpus-15.0-2023-09-08-hy-AM + output_manifest_file: ${workspace_dir}/manifest0.json + + - _target_: sdp.processors.datasets.arm.ASR_Whisper + output_manifest_file: ${workspace_dir}/manifest1.json + pretrained_model: "large-v2" + output_text_field: pred_text + + - _target_: sdp.processors.DropHighWER + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest3.json + text_key: text 
+ pred_text_key: pred_text + cer_threshold: 30 diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index eb536eb9..95745512 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -1,16 +1,18 @@ -from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry -from sdp.logging import logger -import numpy as np +import torch +import whisper # pip install -U openai-whisper import os +import json import pandas as pd from tqdm import tqdm -import json from pathlib import Path import soundfile as sf import subprocess from typing import Dict, List, Union +from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry +from sdp.logging import logger + def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: result = [] with manifest.open() as f: @@ -40,7 +42,8 @@ def __init__( self.extention = extention def read_manifest(self): - input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] + input_files = [str(self.raw_data_dir / video) for video in \ + self.raw_data_dir.rglob('*.' 
+ self.extention)] v_df = pd.DataFrame({self.output_field: input_files}) return v_df.values @@ -157,44 +160,36 @@ def __init__( self.output_text_field = output_text_field self.device = device self.batch_size = batch_size - - def process(self): - import torch - import whisper # pip install -U openai-whisper - self.whisper = whisper - if self.device is None: if torch.cuda.is_available(): self.device = "cuda" else: self.device = "cpu" - - self.model = self.whisper.load_model(self.pretrained_model) - - manifest = load_manifest(Path(self.input_manifest_file)) + self.model = whisper.load_model(self.pretrained_model) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(manifest): - text_hyp, lang = self.whisper_infer(item["audio_filepath"]) - # print(f"Detected language: {lang}") - item[self.output_text_field] = text_hyp + for item in tqdm(json_list): + pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) + + item[self.output_text_field] = pred_text f.write(json.dumps(item, ensure_ascii=False) + '\n') def whisper_infer(self, audio_path): - audio = self.whisper.load_audio(audio_path) + audio = whisper.load_audio(audio_path) - audio = self.whisper.pad_or_trim(audio) - mel = self.whisper.log_mel_spectrogram(audio) + audio = whisper.pad_or_trim(audio) + mel = whisper.log_mel_spectrogram(audio) mel = mel.to(self.device) _, probs = self.model.detect_language(mel) lang = max(probs, key=probs.get) - options = self.whisper.DecodingOptions() - result = self.whisper.decode(self.model, mel, options) + options = whisper.DecodingOptions() + result = whisper.decode(self.model, mel, options) return result.text, lang - - \ No newline at end of file From 6b4a9a61c2032fc5d331d68a092f2ca3da9171ff Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 10 Nov 2023 08:49:57 -0800 Subject: [PATCH 027/115] 
split Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 25 ++++++-- dataset_configs/commoncrawl/big_fr.yaml | 25 ++++++-- dataset_configs/commoncrawl/big_pl.yaml | 25 ++++++-- .../datasets/commoncrawl/__init__.py | 3 +- .../datasets/commoncrawl/commoncrawl.py | 61 +++++++++++++++++++ 5 files changed, 126 insertions(+), 13 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index d1643b23..7686277b 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -1,17 +1,18 @@ processors_to_run: "0:" # ü ä ö ß Ä Ö Ü -workspace_dir: /mnt/md1/out/de # /mnt/md0/common_crawl/cc_sdp/de +lang: de +workspace_dir: /mnt/md1/out/${lang} # /mnt/md0/common_crawl/cc_sdp/de processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang - target_value: de + target_value: ${lang} - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang - target_value: de + target_value: ${lang} - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest2.json @@ -140,4 +141,20 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + output_manifest_file: ${workspace_dir}/manifest19.json + lang: ${lang} + data_split: train + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest19_dev.json + lang: ${lang} + data_split: dev + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: 
${workspace_dir}/manifest19_test.json + lang: ${lang} + data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 5cf70f42..1f81ab38 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -1,17 +1,18 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/out/fr #/mnt/md0/common_crawl/cc_sdp/fr +lang: fr +workspace_dir: /mnt/md1/out/${lang} #/mnt/md0/common_crawl/cc_sdp/fr processors: - _target_: sdp.processors.datasets.cc.cc.PreserveByValue input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang - target_value: fr + target_value: ${lang} - _target_: sdp.processors.datasets.cc.cc.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang - target_value: fr + target_value: ${lang} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest2.json @@ -145,4 +146,20 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + output_manifest_file: ${workspace_dir}/manifest20.json + lang: ${lang} + data_split: train + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest20_dev.json + lang: ${lang} + data_split: dev + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest20_test.json + lang: ${lang} + data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 29a590cb..ff2f7847 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ 
b/dataset_configs/commoncrawl/big_pl.yaml @@ -1,17 +1,18 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/out/pl #/mnt/md0/common_crawl/cc_sdp/pl +lang: pl +workspace_dir: /mnt/md1/out/${lang} #/mnt/md0/common_crawl/cc_sdp/pl processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang - target_value: pl + target_value: ${lang} - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang - target_value: pl + target_value: ${lang} - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest2.json @@ -125,4 +126,20 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + output_manifest_file: ${workspace_dir}/manifest17.json + lang: ${lang} + data_split: train + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest17_dev.json + lang: ${lang} + data_split: dev + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest17_test.json + lang: ${lang} + data_split: test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 22ed086d..55877778 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -15,4 +15,5 @@ from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, 
FfmpegConvert, ASR_HF, AlignerSubprocess, \ - SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration + SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ + TrainDevTestSplitCC diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index f587d920..49d68a70 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -18,6 +18,67 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance +class TrainDevTestSplitCC(BaseParallelProcessor): + """Custom train-dev-test split for CommonCrawl (CC) dataset. + + Split is done speaker-wise, so the same speakers don't appear in different + splits. + + Args: + data_split (str): train, dev or test. + lang (str): language to process. + + Returns: + All the same fields as in the input manifest, but only a subset of + the data is retained.
+ """ + + def __init__( + self, + data_split: str, + lang: str, + **kwargs, + ): + super().__init__(**kwargs) + if data_split not in ["train", "dev", "test"]: + raise ValueError("data_split has to be either train, dev or test") + self.data_split = data_split + self.lang = lang + + self.split_map = {} + self.split_map["en"] = {} + self.split_map["de"] = {} + self.split_map["de"]["dev"] = set( + ['0383522', '0327835', '0327898', '0619871', '0387103', '0854766', '0738911', '0739038', '0854558', '0505561', '0735963', '0086041', '0967593', '0114210', '0098270', '0387140', '0917035', '0327745', '0914212', '0739071'] + ) + self.split_map["de"]["test"] = set( + ['0076939', '0589098', '0916988', '0268959', '0085896', '0327813', '0085897', '0739103', '0502188', '0034822', '0327729', '0572412', '0327680', '0027277', '0324720', '0209876', '0027226', '0268926', '0209776', '0738970'] + ) + self.split_map["pl"] = {} + self.split_map["pl"]["dev"] = set( + ['0977373', '0949141', '0455759', '0357429', '0401864', '0714974', '0422716', '0363476', '0714976', '0927100'] + ) + self.split_map["pl"]["test"] = set( + ['0157903', '0115644', '0774572', '0688432', '0258376', '0396163', '0456013', '0571489', '0157653', '0062567'] + ) + self.split_map["fr"] = {} + self.split_map["fr"]["dev"] = set( + ['0588135', '0706751', '0533213', '0920924', '0355413', '0985711', '0113477', '0533044', '0089551', '0944509', '0944576', '0766533', '0263084', '0113490', '0647104', '0273918', '0473607', '0706753', '0800223', '0300105', '0944416', '0566712', '0533102', '0177064', '0029651', '0215767', '0054412', '0236920', '0885068', '0296098', '0113592', '0706610', '0473383', '0330163', '0681542', '0272523', '0985709', '0564446', '0944481', '0587986', '0804060', '0236908', '0969694', '0054058', '0800671', '0236923', '0986025', '0770086', '0825692', '0968870', '0152315', '0533147', '0647027', '0029342', '0272698', '0153863', '0355323', '0988779', '0985959', '0237013', '0338134', '0885097', '0507678', '0507687', 
'0944485', '0825768', '0742440', '0969664', '0885089', '0117211', '0296044', '0985958', '0214384', '0021267', '0565392', '0388467', '0151715', '0861950', '0112768', '0113596', '0621657', '0236860', '0647128', '0058479', '0803614', '0177501', '0533110', '0566787', '0944496', '0859701', '0885165', '0212639', '0054532', '0919263', '0740701'] + ) + self.split_map["fr"]["test"] = set( + ['0473649', '0390470', '0296024', '0355365', '0314592', '0682498', '0534637', '0270580', '0532999', '0373977', '0622032', '0825761', '0923303', '0113485', '0825868', '0473710', '0511698', '0844353', '0801733', '0091695', '0452351', '0825872', '0969173', '0986055', '0970208', '0141266', '0149629', '0296117', '0153112', '0801752', '0030816', '0508766', '0029390', '0825877', '0271152', '0388655', '0743376', '0177466', '0153032', '0329945', '0473606', '0986015', '0096178', '0089561', '0440564', '0741466', '0499703', '0272514', '0944571', '0919512', '0646950', '0533215', '0760703', '0733028', '0113488', '0825739', '0492402', '0214463', '0154278', '0801877', '0825675', '0675029', '0801729', '0414446', '0054425', '0279176', '0296100', '0355317', '0733026', '0089548', '0177502', '0851638', '0851640', '0448606', '0803096', '0766603', '0507914', '0092173', '0647061', '0473564', '0706765', '0766538', '0295994', '0851630', '0029358', '0647062', '0825838', '0153786', '0944526', '0944484', '0588046', '0706820', '0177465', '0622092', '0332657', '0944480'] + ) + + def process_dataset_entry(self, data_entry): + file_id = os.path.splitext(data_entry["audio_filepath"])[0].split("/")[-2] + if self.data_split == "train": + if file_id not in self.split_map[self.lang]["dev"] and file_id not in self.split_map[self.lang]["test"]: + return [DataEntry(data=data_entry)] + else: + if file_id in self.split_map[self.lang][self.data_split]: + return [DataEntry(data=data_entry)] + return [] + class JoinBy(BaseProcessor): """ From 860ed6a6f0c60bbb1daa5860ac9215454ef73531 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov 
Date: Fri, 10 Nov 2023 08:51:36 -0800 Subject: [PATCH 028/115] it nl eu Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_eu.yaml | 113 ++++++++++++++++++ dataset_configs/commoncrawl/big_it.yaml | 150 ++++++++++++++++++++++++ dataset_configs/commoncrawl/big_nl.yaml | 128 ++++++++++++++++++++ 3 files changed, 391 insertions(+) create mode 100644 dataset_configs/commoncrawl/big_eu.yaml create mode 100644 dataset_configs/commoncrawl/big_it.yaml create mode 100644 dataset_configs/commoncrawl/big_nl.yaml diff --git a/dataset_configs/commoncrawl/big_eu.yaml b/dataset_configs/commoncrawl/big_eu.yaml new file mode 100644 index 00000000..fc7e8e49 --- /dev/null +++ b/dataset_configs/commoncrawl/big_eu.yaml @@ -0,0 +1,113 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/eu + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: eu + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: eu + + - _target_: sdp.processors.datasets.commoncrawl.ASR_HF + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: cahya/wav2vec2-large-xlsr-basque + output_text_field: pred_text + batch_size: 16 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} 
+ - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZóÓáÁéÉíÍñÑúÚüÜçÇ'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest8.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: 
sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_it.yaml b/dataset_configs/commoncrawl/big_it.yaml new file mode 100644 index 00000000..d95e835f --- /dev/null +++ b/dataset_configs/commoncrawl/big_it.yaml @@ -0,0 +1,150 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/it + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: it + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: it + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_it_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', 
"repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '•', "repl": " "} + - {"pattern": '●', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: text + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/it/data/whitelist.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + 
- {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZàèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest10.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + pred_text_key: pred_text + 
wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_nl.yaml b/dataset_configs/commoncrawl/big_nl.yaml new file mode 100644 index 00000000..254b1694 --- /dev/null +++ b/dataset_configs/commoncrawl/big_nl.yaml @@ -0,0 +1,128 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/nl + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: nl + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: nl + + - _target_: sdp.processors.datasets.commoncrawl.ASR_HF + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: jonatasgrosman/wav2vec2-large-xlsr-53-dutch + output_text_field: pred_text + batch_size: 16 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'è', "repl": "e"} + - {"pattern": 'È', "repl": "E"} + - {"pattern": 'ù', "repl": "u"} + - {"pattern": 'ò', "repl": "o"} + - {"pattern": 'à', "repl": "a"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', 
"repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '•', "repl": " "} + - {"pattern": '●', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: text + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZóÓáÁéÉíÍúÚöÖäÄëËïÏüÜ'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: 
${workspace_dir}/manifest8.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file From 17953c491fa399b4e8550b8e14c38ece10d52f8d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 13 Nov 2023 07:53:55 -0800 Subject: [PATCH 029/115] TrainDevTestSplitCC Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index f587d920..6e8a60b9 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -18,6 +18,47 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance +class TrainDevTestSplitCC(BaseParallelProcessor): + """Custom train-dev-test split for CORAAL dataset. + + Split is done speaker-wise, so the same speakers don't appear in different + splits. 
+ + Args: + data_split (str): train, dev or test. + + Returns: + All the same fields as in the input manifest, but only a subset of + the data is retained. + """ + + def __init__( + self, + data_split: str, + **kwargs, + ): + super().__init__(**kwargs) + if data_split not in ["train", "dev", "test"]: + raise ValueError("data_split has to be either train, dev or test") + self.data_split = data_split + self.split_map = {} + self.split_map["dev"] = set( + ['0383522', '0327835', '0327898', '0619871', '0387103', '0854766', '0738911', '0739038', '0854558', '0505561', '0735963', '0086041', '0967593', '0114210', '0098270', '0387140', '0917035', '0327745', '0914212', '0739071'] + ) + self.split_map["test"] = set( + ['0076939', '0589098', '0916988', '0268959', '0085896', '0327813', '0085897', '0739103', '0502188', '0034822', '0327729', '0572412', '0327680', '0027277', '0324720', '0209876', '0027226', '0268926', '0209776', '0738970'] + ) + + def process_dataset_entry(self, data_entry): + file_id = os.path.splitext(data_entry["audio_filepath"])[0].split("/")[-2] + if self.data_split == "train": + if file_id not in self.split_map["dev"] and file_id not in self.split_map["test"]: + return [DataEntry(data=data_entry)] + else: + if file_id in self.split_map[self.data_split]: + return [DataEntry(data=data_entry)] + return [] + class JoinBy(BaseProcessor): """ From b69bfc10c86262cb4d19d4135810a075ef9e25e2 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 15 Nov 2023 04:53:21 -0800 Subject: [PATCH 030/115] en split Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_en.yaml | 18 +++++++++++++++++- dataset_configs/commoncrawl/big_es.yaml | 12 ++++++------ .../datasets/commoncrawl/commoncrawl.py | 6 ++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index c0dfd514..3e3a5ec6 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ 
b/dataset_configs/commoncrawl/big_en.yaml @@ -295,4 +295,20 @@ processors: text_key: text pred_text_key: text_asr_pred cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + output_manifest_file: ${workspace_dir}/manifest20_train.json + lang: ${lang} + data_split: train + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest20_dev.json + lang: ${lang} + data_split: dev + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest20_test.json + lang: ${lang} + data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index a588b180..dda3e771 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -1,14 +1,14 @@ processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/es +workspace_dir: /mnt/md1/out/es #/mnt/md0/common_crawl/cc_sdp/es processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest9a.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: es - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: es @@ -72,7 +72,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest7.json input_manifest_arg: "--manifest" output_manifest_arg: 
"--output_filename" @@ -180,8 +180,8 @@ processors: - _target_: sdp.processors.DropHighLowDuration output_manifest_file: ${workspace_dir}/manifest23.json - high_duration_threshold: 40 - low_duration_threshold: 0.02 + high_duration_threshold: 60 + low_duration_threshold: 0.01 duration_key: nfa_duration - _target_: sdp.processors.ASRInference diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 49d68a70..949c371a 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -47,6 +47,12 @@ def __init__( self.split_map = {} self.split_map["en"] = {} + self.split_map["en"]["dev"] = set( + ['0634236', '0693626', '0029743', '0881322', '0357427', '0455788', '0198472', '0496259', '0812890', '0142281', '0076612', '0629004', '0931592', '0577447', '0768107', '0907768', '0963898', '0671754', '0851569', '0896715'] + ) + self.split_map["en"]["test"] = set( + ['0366790', '0837221', '0733702', '0278253', '0738313', '0437256', '0558223', '0292533', '0777911', '0826607', '0544257', '0744206', '0576248', '0307575', '0307577', '0879895', '0006783', '0006755', '0125649', '0896701'] + ) self.split_map["de"] = {} self.split_map["de"]["dev"] = set( ['0383522', '0327835', '0327898', '0619871', '0387103', '0854766', '0738911', '0739038', '0854558', '0505561', '0735963', '0086041', '0967593', '0114210', '0098270', '0387140', '0917035', '0327745', '0914212', '0739071'] From 381328595a88739476e365dc975c8ff77426aee8 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 16 Nov 2023 02:10:56 -0800 Subject: [PATCH 031/115] rm pandas Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/arm/armenian.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index 95745512..011e8f00 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ 
b/sdp/processors/datasets/arm/armenian.py @@ -44,13 +44,10 @@ def __init__( def read_manifest(self): input_files = [str(self.raw_data_dir / video) for video in \ self.raw_data_dir.rglob('*.' + self.extention)] - v_df = pd.DataFrame({self.output_field: input_files}) - return v_df.values + return input_files def process_dataset_entry(self, data_entry): - (inputf) = data_entry - - data = {self.output_field: inputf[0]} + data = {self.output_field: data_entry} return [DataEntry(data=data)] From 5d30d6a71624d28293f0926a5feb868e01e442c9 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 17 Nov 2023 08:20:04 -0800 Subject: [PATCH 032/115] text processing for MCV PR Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 92 ++++++++++++ sdp/processors/datasets/arm/__init__.py | 2 +- sdp/processors/datasets/arm/armenian.py | 181 +++++++++++++++++++++++- 3 files changed, 273 insertions(+), 2 deletions(-) create mode 100644 dataset_configs/armenian/text.yaml diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml new file mode 100644 index 00000000..ec3093a5 --- /dev/null +++ b/dataset_configs/armenian/text.yaml @@ -0,0 +1,92 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/arm/txt + +processors: + - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt + raw_data_dir: /home/nkarpov/workspace/NeMo-speech-data-processor/dataset_configs/armenian/docs + extention: txt + output_field: source_filepath + output_manifest_file: ${workspace_dir}/manifest0.json + + - _target_: sdp.processors.datasets.arm.ReadTxt + input_field: source_filepath + output_field: text_line + output_manifest_file: ${workspace_dir}/manifest1.json + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest2.json + text_key: text_line + regex_params_list: + - {"pattern": '։', "repl": ':'} + - {"pattern": '․', "repl": "."} + - {"pattern": '—', "repl": "-"} + - {"pattern": '–', "repl": "-"} + - {"pattern": '―', "repl": 
"-"} + - {"pattern": '\.\.\.', "repl": "…"} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.datasets.arm.SplitBySentence + input_field: text_line + output_field: text + pattern: ':|\.|…' + output_manifest_file: ${workspace_dir}/manifest3.json + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest4.json + regex_patterns: + - '[0-9]' + - '\(' + - '\)' + - '\[' + - '\]' + - '\*' + - '"' + - '«' + - '»' + - '[А-Яа-я]' + - '[A-Za-z]' + - '\+' + - '=' + - '¬' + - '&' + + - _target_: sdp.processors.DropNonAlphabet + output_manifest_file: ${workspace_dir}/manifest5.json + alphabet: "՝՞՜՛`֊´’'՚-ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև,:\\.…;" + test_cases: + - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} + - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} + + - _target_: sdp.processors.datasets.arm.NumWords + output_manifest_file: ${workspace_dir}/manifest6.json + alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" + input_field: text + output_field: num_words + + - _target_: sdp.processors.datasets.arm.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest7.json + input_field: num_words + target_value: 15 + operator: le + + - _target_: sdp.processors.datasets.arm.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest8.json + input_field: num_words + target_value: 3 + operator: ge + + - _target_: sdp.processors.datasets.arm.GetSource + output_manifest_file: ${workspace_dir}/manifest9.json + input_field: source_filepath + output_field: Source + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest10.json + rename_fields: {"text": "Sentence"} + + - _target_: 
sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/manifest11.json + fields_to_keep: ["Sentence", "Source"] + + - _target_: sdp.processors.datasets.arm.MakeTsv + output_manifest_file: ${workspace_dir}/manifest11.tsv diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/datasets/arm/__init__.py index 9f1dd5cc..e82b381f 100644 --- a/sdp/processors/datasets/arm/__init__.py +++ b/sdp/processors/datasets/arm/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper +from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper, SplitBySentence, NumWords, PreserveByValue, ReadTxt, GetSource, MakeTsv diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index 011e8f00..690058b0 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -2,16 +2,19 @@ import whisper # pip install -U openai-whisper import os import json +import re import pandas as pd from tqdm import tqdm from pathlib import Path import soundfile as sf import subprocess from typing import Dict, List, Union +from operator import lt, le, eq, ne, ge, gt from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger + def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: result = [] @@ -189,4 +192,180 @@ def whisper_infer(self, audio_path): options = whisper.DecodingOptions() result = whisper.decode(self.model, mel, options) return result.text, lang - \ No newline at end of file + +class ReadTxt(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + fname = data_entry[self.input_field] + data_list = [] + with open(fname, "r") as f: + for line in f: + line = line.strip() + if line: + data = data_entry.copy() + data[self.output_field] = line + data_list.append(DataEntry(data=data)) + return data_list + + +class SplitBySentence(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. + """ + def __init__( + self, + input_field: str, + output_field: str, + pattern: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.pattern = re.compile(pattern) + + def process_dataset_entry(self, data_entry): + line = data_entry[self.input_field] + data_list = [] + start = 0 + ends = [m.start() for m in self.pattern.finditer(line)] + if ends: + for end in ends: + sent = line[start:end+1].strip() + # if sent and sent[0].isupper(): + data = data_entry.copy() + data[self.output_field] = sent + data_list.append(DataEntry(data=data)) + start = end+1 + else: + data = data_entry.copy() + data[self.output_field] = line.strip() + data_list.append(DataEntry(data=data)) + return data_list + +class NumWords(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + alphabet: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.pattern = re.compile("[^"+alphabet+"]") + + def process_dataset_entry(self, data_entry): + text = data_entry[self.input_field] + cleaned_string = self.pattern.sub(' ', text) + cleaned_string = re.sub(' ', ' ', cleaned_string).strip() + words = cleaned_string.split() + num_words = len(words) + data_entry[self.output_field] = num_words + return [DataEntry(data=data_entry)] + + +class PreserveByValue(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + """ + def __init__( + self, + input_field: str, + target_value: Union[int, str], + operator: str = "eq", + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.target_value = target_value + if operator == "lt": + self.operator = lt + elif operator == "le": + self.operator = le + elif operator == "eq": + self.operator = eq + elif operator == "ne": + self.operator = ne + elif operator == "ge": + self.operator = ge + elif operator == "gt": + self.operator = gt + + def process_dataset_entry(self, data_entry): + input_value = data_entry[self.input_field] + target = self.target_value + if self.operator(input_value, target): + return [DataEntry(data=data_entry)] + else: + return [DataEntry(data=None)] + + +class GetSource(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + input_values = os.path.splitext(data_entry[self.input_field])[0].split("/") + + data_entry[self.output_field] = input_values[-1] + ", " +input_values[-2] + if input_values[-2] == "Նար-Դոս": + data_entry[self.output_field] += " (1867 - 1933), " + "https://hy.wikisource.org/wiki/%D5%80%D5%A5%D5%B2%D5%AB%D5%B6%D5%A1%D5%AF:%D5%86%D5%A1%D6%80-%D4%B4%D5%B8%D5%BD" + elif input_values[-2] == "Ակսել Բակունց": + data_entry[self.output_field] += " (1899 - 1937), " + "https://aybuben.com/axel-bakunts" + return [DataEntry(data=data_entry)] + +def read_jsonl(manifest_file): + rec = [] + with open(manifest_file, 'r') as the_file: + for l in the_file: + rec.append(json.loads(l)) + return pd.DataFrame.from_records(rec) + +class MakeTsv(BaseProcessor): + """ + """ + def __init__( + self, + **kwargs, + ): + super().__init__(**kwargs) + + def process(self): + df1 = read_jsonl(self.input_manifest_file) + df1.to_csv(self.output_manifest_file, index=None) \ No newline at end of file From aa21b8722d6a886b636cc306de78cf598ac46939 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 17 Nov 2023 08:22:40 -0800 Subject: [PATCH 033/115] path Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index ec3093a5..ac9fbbf4 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/ssd8/arm/txt processors: - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt - raw_data_dir: /home/nkarpov/workspace/NeMo-speech-data-processor/dataset_configs/armenian/docs + raw_data_dir: /home/nkarpov/data/arm_docs extention: txt output_field: 
source_filepath output_manifest_file: ${workspace_dir}/manifest0.json From 9d5e195181d71c47933fe772d60f8b6b2221e6ba Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 21 Nov 2023 08:54:35 -0800 Subject: [PATCH 034/115] RandomPart Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 7 ++++++- sdp/processors/datasets/arm/__init__.py | 3 ++- sdp/processors/datasets/arm/armenian.py | 21 ++++++++++++++++++--- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index ac9fbbf4..ff16b056 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -34,6 +34,7 @@ processors: - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest4.json regex_patterns: + - 'տիկ\. $' - '[0-9]' - '\(' - '\)' @@ -89,4 +90,8 @@ processors: fields_to_keep: ["Sentence", "Source"] - _target_: sdp.processors.datasets.arm.MakeTsv - output_manifest_file: ${workspace_dir}/manifest11.tsv + output_manifest_file: ${workspace_dir}/manifest12.tsv + + - _target_: sdp.processors.datasets.arm.RandomPart + output_manifest_file: ${workspace_dir}/manifest13.tsv + part: 0.05 diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/datasets/arm/__init__.py index e82b381f..fe02dc9a 100644 --- a/sdp/processors/datasets/arm/__init__.py +++ b/sdp/processors/datasets/arm/__init__.py @@ -12,4 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper, SplitBySentence, NumWords, PreserveByValue, ReadTxt, GetSource, MakeTsv +from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper, SplitBySentence, NumWords, PreserveByValue, \ + ReadTxt, GetSource, MakeTsv, RandomPart diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index 690058b0..e7556410 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -279,8 +279,8 @@ def __init__( def process_dataset_entry(self, data_entry): text = data_entry[self.input_field] - cleaned_string = self.pattern.sub(' ', text) - cleaned_string = re.sub(' ', ' ', cleaned_string).strip() + cleaned_string = self.pattern.sub('', text).strip() + cleaned_string = re.sub('\s+', ' ', cleaned_string).strip() words = cleaned_string.split() num_words = len(words) data_entry[self.output_field] = num_words @@ -368,4 +368,19 @@ def __init__( def process(self): df1 = read_jsonl(self.input_manifest_file) - df1.to_csv(self.output_manifest_file, index=None) \ No newline at end of file + df1.to_csv(self.output_manifest_file, index=None) + +class RandomPart(BaseProcessor): + """ + """ + def __init__( + self, + part: float, + **kwargs, + ): + super().__init__(**kwargs) + self.part = part + + def process(self): + df1 = pd.read_csv(self.input_manifest_file) + df1.sample(frac=self.part).to_csv(self.output_manifest_file, index=None) \ No newline at end of file From 5b7700fca8a4afcacd8feb1b027fb12360d1997a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 24 Nov 2023 01:35:32 -0800 Subject: [PATCH 035/115] random_state Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/mcv.yaml | 2 +- dataset_configs/armenian/text.yaml | 17 +++++++++++++---- sdp/processors/datasets/arm/armenian.py | 20 ++++++++++++-------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git 
a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index c865f91b..83652cf5 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -2,7 +2,7 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/arm/mcv processors: - - _target_: sdp.processors.CreateInitialManifestMCV + - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis raw_data_dir: /home/nkarpov/data/hy extract_archive_dir: /mnt/ssd8/arm/mcv/row resampled_audio_dir: /mnt/ssd8/arm/mcv/16k diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index ff16b056..d2affe6c 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/ssd8/arm/txt processors: - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt - raw_data_dir: /home/nkarpov/data/arm_docs + raw_data_dir: /home/nkarpov/workspace/arm_docs_old/arm_docs #/home/nkarpov/workspace/arm_docs extention: txt output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json @@ -23,7 +23,7 @@ processors: - {"pattern": '–', "repl": "-"} - {"pattern": '―', "repl": "-"} - {"pattern": '\.\.\.', "repl": "…"} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.datasets.arm.SplitBySentence input_field: text_line @@ -34,7 +34,8 @@ processors: - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest4.json regex_patterns: - - 'տիկ\. $' + - 'տիկ\. $' + - 'Գ\. 
$' - '[0-9]' - '\(' - '\)' @@ -94,4 +95,12 @@ processors: - _target_: sdp.processors.datasets.arm.RandomPart output_manifest_file: ${workspace_dir}/manifest13.tsv - part: 0.05 + random_state: 100 + part: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + input_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: Sentence + regex_patterns: + - '^…' \ No newline at end of file diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index e7556410..32a2b5c2 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -253,6 +253,8 @@ def process_dataset_entry(self, data_entry): data[self.output_field] = sent data_list.append(DataEntry(data=data)) start = end+1 + if start Date: Fri, 24 Nov 2023 07:07:30 -0800 Subject: [PATCH 036/115] docstring Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 389 +++++++++++++----- 1 file changed, 297 insertions(+), 92 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 949c371a..243a12bc 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -88,34 +88,48 @@ def process_dataset_entry(self, data_entry): class JoinBy(BaseProcessor): """ - This processor join several lines into one - input_field (str): where to get path to wav file. + This processor join several lines into one using key input_field + Args: + input_field (str): where to get path to wav file. + text_field (str): where to put resulted text. + audio_field (str): where to put resulted wav file. 
+ + Returns: + All the same fields as in the input manifest plus audio_field """ def __init__( self, input_field: str, + text_field: str = "text", + audio_field: str = 'audio_filepath', **kwargs, ): super().__init__(**kwargs) self.input_field = input_field + self.text_field = text_field + self.audio_field = audio_field def process(self): df1 = read_jsonl(self.input_manifest_file) pattern = re.compile("\s{2,}") - df1["text"] = df1["text"].apply(lambda x: pattern.sub(" ", x).strip()) + df1[self.text_field] = df1[self.text_field].apply(lambda x: pattern.sub(" ", x).strip()) # df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) - df2 = pd.DataFrame(df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df["text"].tolist())), columns=["text"]).reset_index() - df2['audio_filepath'] = df2[self.input_field] - write_jsonl(df2[['audio_filepath', 'text']], self.output_manifest_file) + df2 = pd.DataFrame(df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df[self.text_field].tolist())), columns=[self.text_field]).reset_index() + df2[self.audio_field] = df2[self.input_field] + write_jsonl(df2[[self.audio_field, self.text_field]], self.output_manifest_file) class AudioDuration(BaseParallelProcessor): """ - Args: + Count audio duration using audio file path from input_field + + Args: input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. + output_field (str): where to put to audio duration. + Returns: + All the same fields as in the input manifest plus output_field """ def __init__( self, @@ -138,10 +152,15 @@ def process_dataset_entry(self, data_entry): class EvalBandwidth(BaseParallelProcessor): """ - Args: + Count audio bandwidth using audio file path from input_field + + Args: input_field (str): where to get path to wav file. output_field (str): where to put to frequency bandwidth. 
threshold (str): power threshold (in dB relative to peak power in spectrum bin) to estimate frequency bandwidth. + + Returns: + All the same fields as in the input manifest plus output_field. """ def __init__( self, @@ -179,12 +198,14 @@ def eval_bandwidth(self, signal, sr, threshold=-50): class SplitByAligner(BaseParallelProcessor): """ - split wav file using NFA aligner fields: nfa_start, nfa_duration + Split wav file using NFA aligner fields: nfa_start, nfa_duration - Args: + Args: input_field (str): field to get source wav file names. output_field: (str): field to put splited wav file names. splited_audio_dir (str): where to save splited wav files. + Returns: + All the same fields as in the input manifest plus output_field. """ def __init__( self, @@ -227,12 +248,15 @@ def process_dataset_entry(self, data_entry): class ASR_HF(BaseProcessor): """ - Transcribe usinf ASR model from HuggingFace. - Args: + Transcribe usinf ASR model from HuggingFace. + + Args: pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. batch_size (str): Inference batch size. + Returns: + All the same fields as in the input manifest plus output_text_field. """ def __init__( self, @@ -278,8 +302,9 @@ def process(self): class UseSonar(BaseProcessor): """ - Count vector distance using Sonar library. - Args: + Count vector distance using Sonar library. + + Args: input_text_field (str): field with text to process. input_audio_field (str): field with audio file path to process. output_field (str): field to save distance. @@ -288,6 +313,8 @@ class UseSonar(BaseProcessor): text_encoder_model (str): name of pretrained text encoder model. batch_size (int): batch size for inference. device (str): device to inference on it. + Returns: + All the same fields as in the input manifest plus output_field. 
""" def __init__( self, @@ -370,11 +397,14 @@ def process_batch(self): class BLEUScore(BaseParallelProcessor): """ - Count BLEU Score - Args: - ref_field (str): field with reference texts - hyp_field (str): field with hypotheses - output_field (str): field to save BLEU Score + Count BLEU Score. + + Args: + ref_field (str): field with reference texts + hyp_field (str): field with hypotheses + output_field (str): field to save BLEU Score + Returns: + All the same fields as in the input manifest plus output_field. """ def __init__( self, @@ -399,18 +429,27 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] class Subprocess(BaseProcessor): - """This processor performs subprocess. - - ASR predictions will be saved in the ``pred_text`` key. - - Args: - pretrained_model (str): the name of the pretrained NeMo ASR model - which will be used to do inference. - batch_size (int): the batch size to use for ASR inference. Defaults to 32. - - Returns: - The same data as in the input manifest with an additional field - ``pred_text`` containing ASR model's predictions. + """ + A class for handling subprocess execution with additional features for managing input and output manifests. + + Parameters: + - cmd (str): The command to be executed as a subprocess. + - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. + - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + - **kwargs: Additional keyword arguments to be passed to the base class. + + Attributes: + - input_manifest_arg (str): The argument specifying the input manifest. + - output_manifest_arg (str): The argument specifying the output manifest. + - arg_separator (str): The separator used between argument and value. + - cmd (str): The command to be executed. 
+ + Methods: + - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. + + Note: + - The `BaseProcessor` class is assumed to be the base class, providing common functionality. """ def __init__( @@ -428,7 +467,6 @@ def __init__( self.cmd = cmd def process(self): - """This will add "pred_text" key into the output manifest.""" os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") @@ -448,18 +486,27 @@ def process(self): subprocess.run(process_args) class NmtSubprocess(Subprocess): - """This processor performs ASR inference on each utterance of the input manifest. - - ASR predictions will be saved in the ``pred_text`` key. - - Args: - pretrained_model (str): the name of the pretrained NeMo ASR model - which will be used to do inference. - batch_size (int): the batch size to use for ASR inference. Defaults to 32. - - Returns: - The same data as in the input manifest with an additional field - ``pred_text`` containing ASR model's predictions. + """ + A class for executing Neural Machine Translation (NMT) subprocess with enhanced functionality for managing input and output fields. + + Parameters: + - input_field (str): The field in the input manifest containing the source text for translation. + - output_field (str): The field to store the translated output in the output manifest. + - srctext_file (str): The file path to store the source text for translation. + - tgtout_file (str): The file path to store the translated output. + - **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. + + Attributes: + - input_field (str): The field in the input manifest containing the source text for translation. 
+ - output_field (str): The field to store the translated output in the output manifest. + - srctext_file (str): The file path to store the source text for translation. + - tgtout_file (str): The file path to store the translated output. + + Methods: + - process(): Executes the NMT subprocess, handling source text and translation output fields. + + Note: + - This class inherits from the `Subprocess` class and extends its functionality to handle NMT-specific processing. """ def __init__( @@ -491,18 +538,23 @@ def process(self): write_jsonl(df1, self.output_manifest_file) class AlignerSubprocess(Subprocess): - """This processor performs alignment of text on each audio file in the input manifest. + """ + A class for aligning audio transcripts using an aligner subprocess with additional features for managing output fields. - Predictions will be saved in the ``output_field`` key. + Parameters: + - output_field (str): The field in the output manifest to store the aligned transcripts. + - duration_threshold (int, optional): The maximum duration threshold for audio files in seconds. Files exceeding this threshold are excluded from alignment. Defaults to 5000. + - **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - Args: - pretrained_model (str): the name of the pretrained NeMo ASR model - which will be used to do inference. - batch_size (int): the batch size to use for ASR inference. Defaults to 32. + Attributes: + - output_field (str): The field in the output manifest to store the aligned transcripts. + - duration_threshold (int): The maximum duration threshold for audio files in seconds. - Returns: - The same data as in the input manifest with an additional field - ``pred_text`` containing ASR model's predictions. + Methods: + - process(): Executes the aligner subprocess, handling text processing, duration filtering, alignment, and manifest updates. 
+ + Note: + - This class inherits from the `Subprocess` class and extends its functionality to handle aligner-specific processing. """ def __init__( @@ -556,8 +608,25 @@ def process(self): class PreserveByValue(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for preserving dataset entries based on a specified condition involving a target value and an input field. + + Parameters: + - input_field (str): The field in the dataset entries to be evaluated. + - target_value (Union[int, str]): The value to compare with the input field. + - operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), + "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Attributes: + - input_field (str): The field in the dataset entries to be evaluated. + - target_value (Union[int, str]): The value to compare with the input field. + - operator (function): The operator function based on the specified operator. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. """ def __init__( self, @@ -592,8 +661,23 @@ def process_dataset_entry(self, data_entry): class Lang2Iso(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for converting language names to ISO language codes in a dataset. + + Parameters: + - input_lang_field (str): The field in the dataset containing language names to be converted. + - output_lang_field (str): The field to store the corresponding ISO language codes. 
+ - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Attributes: + - input_lang_field (str): The field in the dataset containing language names to be converted. + - output_lang_field (str): The field to store the corresponding ISO language codes. + - iso_m (dict): A mapping of language names to ISO language codes. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, converting language names to ISO language codes. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to perform language name to ISO code conversion. """ def __init__( self, @@ -616,8 +700,26 @@ def process_dataset_entry(self, data_entry): class SplitByVttSentence(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for splitting audio files based on VTT (WebVTT) sentence-level segmentation in a dataset. + + Parameters: + - splited_audio_dir (str): The directory to store the split audio files. + - source_audio_field (str): The field in the dataset containing the path to the source audio files. + - target_audio_field (str): The field to store the paths of the split audio files. + - duration_field (str): The field to store the duration of each split audio segment. + - text_field (str): The field to store the transcriptions corresponding to each split audio segment. + - vtt_field (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. + - proxy_fields (List[str], optional): List of additional fields to proxy from the original data entry to the split entries. Defaults to an empty list. + - duration_threshold (float, optional): The duration threshold in seconds for each split audio segment. Defaults to 10.0. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
+ + + Methods: + - prepare(): Creates the directory to store the split audio files. + - process_dataset_entry(data_entry): Processes a single dataset entry, splitting audio based on VTT sentence-level segmentation. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. """ def __init__( self, @@ -691,8 +793,26 @@ def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, class SplitByVtt(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for splitting audio files based on VTT (WebVTT) segmentation in a dataset. + + Parameters: + - splited_audio_dir (str): The directory to store the split audio files. + - source_audio_field (str): The field in the dataset containing the path to the source audio files. + - text_lang_field (str): The field in the dataset containing the language information of the text. + - audio_lang_field (str): The field in the dataset containing the language information of the audio. + - key_field (str): The field in the dataset containing a unique key for each entry. + - target_audio_field (str): The field to store the paths of the split audio files. + - duration_field (str): The field to store the duration of each split audio segment. + - text_field (str): The field to store the transcriptions corresponding to each split audio segment. + - vtt_field (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Creates the directory to store the split audio files. + - process_dataset_entry(data_entry): Processes a single dataset entry, splitting audio based on VTT segmentation. 
+ + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. """ def __init__( self, @@ -741,8 +861,17 @@ def process_dataset_entry(self, data_entry): class AudioLid(BaseProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for language identification (LID) of audio files using a pre-trained LID model. + + Args: + - input_audio_field (str): The field in the dataset containing the path to the audio files for language identification. + - pretrained_model (str): The name of the pre-trained ASR model for language identification. + - output_lang_field (str): The field to store the identified language for each audio file. + - device (str): The device to run the ASR model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. + + Note: + - This class inherits from the `BaseProcessor` class and extends its functionality to perform language identification using a pre-trained ASR model. """ def __init__( self, @@ -792,8 +921,21 @@ def process(self): class TextLid(BaseProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for language identification (LID) of text using a pre-trained text classification model. + + Args: + - input_text_field (str): The field in the dataset containing the text for language identification. + - pretrained_model (str): The name or path of the pre-trained text classification model for language identification. + - output_lang_field (str): The field to store the identified language for each text. + - device (str): The device to run the text classification model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. 
+ - drop_text_duplicates (bool, optional): If True, drops duplicate texts from the output manifest. Defaults to False. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. + + Methods: + - process(): Processes the language identification for each text in the dataset and saves the results in a new manifest file. + + Note: + - This class inherits from the `BaseProcessor` class and extends its functionality to perform language identification using a pre-trained text classification model. """ def __init__( self, @@ -846,8 +988,18 @@ def process(self): class AllVttText(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for extracting text content from VTT (WebVTT) files and updating the manifest. + + Args: + - output_text_field (str): The field to store the extracted text content in the manifest. + - input_filepath_field (str, optional): The field in the manifest containing the path to VTT files. Defaults to "vtt_filepath". + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, extracts text content from the specified VTT file, and updates the manifest. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract text content from VTT files and update the manifest. """ def __init__( self, @@ -873,11 +1025,21 @@ def process_dataset_entry(self, data_entry): class TxtToVtt(BaseParallelProcessor): """ - Args: - raw_data_dir (str): where to put raw downloaded data. - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. - target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. + A class for converting text files to WebVTT (VTT) format and updating the manifest. 
+ + Args: + - vtt_files_dir (str): The directory where the generated VTT files will be saved. + - key_field (str): The field in the manifest representing the unique key or identifier for each entry. + - text_field (str): The field in the manifest containing the text content to be converted to VTT format. + - vtt_field (str): The field to store the generated VTT file paths in the manifest. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Creates the directory for saving the generated VTT files. + - process_dataset_entry(data_entry): Processes a single dataset entry, converts the text content to VTT format, and updates the manifest. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert text files to WebVTT (VTT) format and update the manifest. """ def __init__( self, @@ -913,8 +1075,21 @@ def process_dataset_entry(self, data_entry): class ReadParquet(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for reading information from Parquet files and updating the manifest with video URLs and captions. + + Args: + - output_video_field (str): The field to store the extracted video URLs in the manifest. + - output_caption_field (str): The field to store the extracted captions in the manifest. + - key_field (str): The field in the manifest representing the unique key or identifier for each entry. + - raw_data_dir (str): The directory containing Parquet files with information to be read. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Reads and prepares information from Parquet files, storing it in the `urls` DataFrame. + - process_dataset_entry(data_entry): Processes a single dataset entry, extracts video URLs and captions based on the key, and updates the manifest. 
+ + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read information from Parquet files and update the manifest with video URLs and captions. """ def __init__( self, @@ -961,11 +1136,22 @@ def get_key(x): class CreateInitialManifestCC(BaseParallelProcessor): """ - Args: - raw_data_dir (str): where to put raw downloaded data. - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. - target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. + A class for creating an initial dataset manifest from image and text files with common keys. + + Args: + - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. + - video_field (str): The field to store the paths to the image files in the dataset. + - key_field (str): The field to represent the common key or identifier for each entry. + - text_field (str): The field to store the paths to the text files in the dataset. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Creates the directory for saving the initial dataset manifest. + - read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. + - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. 
""" def __init__( self, @@ -1009,13 +1195,22 @@ def process_dataset_entry(self, data_entry): class FfmpegConvert(BaseParallelProcessor): """ - Args: - input_field (str): field with path to video file in the input manifest - output_field (str): field with path to audio file in the output manifest - key_field (str): field with key value - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. - target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. + A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. + + Args: + - resampled_audio_dir (str): The directory to store the resampled audio files. + - input_field (str): The field in the dataset representing the path to the input video files. + - output_field (str): The field to store the path to the resampled audio files in the dataset. + - key_field (str): The field in the dataset representing the unique key or identifier for each entry. + - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. + - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ def __init__( self, @@ -1051,11 +1246,21 @@ def process_dataset_entry(self, data_entry): class CreateInitialManifestExt(BaseParallelProcessor): """ - Args: - raw_data_dir (str): where to put raw downloaded data. - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
- target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. + A class for creating an initial dataset manifest from audio files with a specified extension. + + Args: + - raw_data_dir (str): The directory containing audio files to include in the initial dataset manifest. + - output_field (str, optional): The field to store the audio file paths in the dataset. Defaults to "audio_filepath". + - extention (str, optional): The file extension of the audio files to include in the manifest. Defaults to "mp3". + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Creates the directory for saving the initial dataset manifest. + - read_manifest(): Reads the audio files with the specified extension and creates a DataFrame with the specified output field. + - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from audio files. 
""" def __init__( self, From 96dfaed9b2bf954170d59e9ad932f3e57651983c Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 28 Nov 2023 03:07:00 -0800 Subject: [PATCH 037/115] split common processors Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/mcv.yaml | 17 ++ dataset_configs/armenian/text.yaml | 10 +- sdp/processors/__init__.py | 10 + sdp/processors/datasets/arm/__init__.py | 3 +- sdp/processors/datasets/arm/armenian.py | 263 +----------------- .../modify_manifest/create_manifest.py | 44 +++ .../modify_manifest/data_to_data.py | 124 +++++++++ .../modify_manifest/data_to_dropbool.py | 56 +++- .../modify_manifest/speech_recognition.py | 124 +++++++++ sdp/utils/common.py | 20 +- 10 files changed, 403 insertions(+), 268 deletions(-) create mode 100644 sdp/processors/modify_manifest/create_manifest.py create mode 100644 sdp/processors/modify_manifest/speech_recognition.py diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index 83652cf5..b7ef21be 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -25,3 +25,20 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 + + - _target_: sdp.processors.ASR_transformer #pip install accelerate + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest4.json + pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" + output_text_field: pred_text3 + + - _target_: sdp.processors.DropHighWER + text_key: text + pred_text_key: pred_text3 + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + pred_text_key: pred_text3 + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index d2affe6c..03bb1610 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -2,13 +2,13 @@ 
processors_to_run: "0:" workspace_dir: /mnt/ssd8/arm/txt processors: - - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt + - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: /home/nkarpov/workspace/arm_docs_old/arm_docs #/home/nkarpov/workspace/arm_docs extention: txt output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: sdp.processors.datasets.arm.ReadTxt + - _target_: sdp.processors.ReadTxt input_field: source_filepath output_field: text_line output_manifest_file: ${workspace_dir}/manifest1.json @@ -65,13 +65,13 @@ processors: input_field: text output_field: num_words - - _target_: sdp.processors.datasets.arm.PreserveByValue + - _target_: sdp.processors.PreserveByThreshold output_manifest_file: ${workspace_dir}/manifest7.json input_field: num_words target_value: 15 operator: le - - _target_: sdp.processors.datasets.arm.PreserveByValue + - _target_: sdp.processors.PreserveByThreshold output_manifest_file: ${workspace_dir}/manifest8.json input_field: num_words target_value: 3 @@ -93,7 +93,7 @@ processors: - _target_: sdp.processors.datasets.arm.MakeTsv output_manifest_file: ${workspace_dir}/manifest12.tsv - - _target_: sdp.processors.datasets.arm.RandomPart + - _target_: sdp.processors.datasets.arm.RandomTsvPart output_manifest_file: ${workspace_dir}/manifest13.tsv random_state: 100 part: 0.01 diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index da200fc0..6a89ebff 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -42,12 +42,16 @@ KeepOnlySpecifiedFields, ) from sdp.processors.modify_manifest.data_to_data import ( + AudioDuration, + FfmpegConvert, + ReadTxt, InsIfASRInsertion, SubIfASRSubstitution, SubMakeLowercase, SubRegex, ) from sdp.processors.modify_manifest.data_to_dropbool import ( + PreserveByThreshold, DropASRError, DropASRErrorBeginningEnd, DropHighCER, @@ -67,3 +71,9 @@ ) from sdp.processors.nemo.asr_inference import ASRInference 
from sdp.processors.nemo.pc_inference import PCInference + +from sdp.processors.modify_manifest.speech_recognition import ( + ASR_transformer, + ASR_Whisper, +) +from sdp.processors.modify_manifest.create_manifest import CreateInitialManifestByExt \ No newline at end of file diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/datasets/arm/__init__.py index fe02dc9a..ee3384e5 100644 --- a/sdp/processors/datasets/arm/__init__.py +++ b/sdp/processors/datasets/arm/__init__.py @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper, SplitBySentence, NumWords, PreserveByValue, \ - ReadTxt, GetSource, MakeTsv, RandomPart +from .armenian import SplitBySentence, NumWords, GetSource, MakeTsv, RandomTsvPart diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index 32a2b5c2..fcac6cee 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -6,222 +6,14 @@ import pandas as pd from tqdm import tqdm from pathlib import Path -import soundfile as sf -import subprocess from typing import Dict, List, Union from operator import lt, le, eq, ne, ge, gt from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry -from sdp.logging import logger +from sdp.processors.modify_manifest.common import load_manifest +from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - -def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: - result = [] - with manifest.open() as f: - for i, line in enumerate(f): - data = json.loads(line) - result.append(data) - return result - -class CreateInitialManifestByExt(BaseParallelProcessor): - """ - Args: - raw_data_dir (str): where to put raw downloaded data. - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
- target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. - """ - def __init__( - self, - raw_data_dir: str, - output_field: str = "audio_filepath", - extention: str = "mp3", - **kwargs, - ): - super().__init__(**kwargs) - self.raw_data_dir = Path(raw_data_dir) - self.output_field = output_field - self.extention = extention - - def read_manifest(self): - input_files = [str(self.raw_data_dir / video) for video in \ - self.raw_data_dir.rglob('*.' + self.extention)] - return input_files - - def process_dataset_entry(self, data_entry): - data = {self.output_field: data_entry} - return [DataEntry(data=data)] - - -def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): - process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] - if ar: - process_args = process_args[:-1] - process_args.extend(["-ar", str(ar), wav]) - return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) - -class FfmpegConvert(BaseParallelProcessor): - """ - Args: - input_field (str): field with path to video file in the input manifest - output_field (str): field with path to audio file in the output manifest - key_field (str): field with key value - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. - target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. 
- """ - def __init__( - self, - resampled_audio_dir: str, - input_field: str, - output_field: str, - key_field: str = None, - target_samplerate: int = 16000, - target_nchannels: int = 1, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - self.key_field = key_field - self.resampled_audio_dir = resampled_audio_dir - self.target_samplerate = target_samplerate - self.target_nchannels = target_nchannels - - def prepare(self): - os.makedirs(os.path.split(self.output_manifest_file)[0], exist_ok=True) - os.makedirs(self.resampled_audio_dir, exist_ok=True) - - def process_dataset_entry(self, data_entry): - video = data_entry[self.input_field] - if self.key_field: - key = data_entry[self.key_field] - os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) - else: - key = os.path.splitext(video)[0].split("/")[-1] - audio = os.path.join(self.resampled_audio_dir, key) + ".wav" - - if not os.path.isfile(audio): - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) - - data_entry[self.output_field]= audio - if self.key_field: - data_entry[self.key_field] = key - return [DataEntry(data=data_entry)] - - -class AudioDuration(BaseParallelProcessor): - """ - Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. 
- """ - def __init__( - self, - input_field: str, - output_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - - def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.input_field] - try: - data, samplerate = sf.read(audio_filepath) - data_entry[self.output_field]=data.shape[0]/samplerate - except Exception as e: - logger.warning(str(e) + " file: " + audio_filepath) - data_entry[self.output_field] = -1.0 - return [DataEntry(data=data_entry)] - - -class ASR_Whisper(BaseProcessor): - """ - Transcribe usinf ASR model from HuggingFace. - Args: - pretrained_model (str): name of pretrained model on HuggingFace. - output_text_field (str): field to save transcription result. - device (str): Inference device. - batch_size (str): Inference batch size. - """ - def __init__( - self, - pretrained_model: str, - output_text_field: str, - device: str = None, - batch_size: str = 1, - **kwargs, - ): - super().__init__(**kwargs) - self.pretrained_model = pretrained_model - self.output_text_field = output_text_field - self.device = device - self.batch_size = batch_size - if self.device is None: - if torch.cuda.is_available(): - self.device = "cuda" - else: - self.device = "cpu" - self.model = whisper.load_model(self.pretrained_model) - - def process(self): - json_list = load_manifest(Path(self.input_manifest_file)) - - Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - - with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(json_list): - pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) - - item[self.output_text_field] = pred_text - f.write(json.dumps(item, ensure_ascii=False) + '\n') - - def whisper_infer(self, audio_path): - audio = whisper.load_audio(audio_path) - - audio = whisper.pad_or_trim(audio) - mel = whisper.log_mel_spectrogram(audio) - mel = mel.to(self.device) - - _, probs = self.model.detect_language(mel) - lang = 
max(probs, key=probs.get) - - options = whisper.DecodingOptions() - result = whisper.decode(self.model, mel, options) - return result.text, lang - -class ReadTxt(BaseParallelProcessor): - """ - Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. - """ - def __init__( - self, - input_field: str, - output_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - - def process_dataset_entry(self, data_entry): - fname = data_entry[self.input_field] - data_list = [] - with open(fname, "r") as f: - for line in f: - line = line.strip() - if line: - data = data_entry.copy() - data[self.output_field] = line - data_list.append(DataEntry(data=data)) - return data_list - - class SplitBySentence(BaseParallelProcessor): """ Args: @@ -289,43 +81,6 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] -class PreserveByValue(BaseParallelProcessor): - """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
- """ - def __init__( - self, - input_field: str, - target_value: Union[int, str], - operator: str = "eq", - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.target_value = target_value - if operator == "lt": - self.operator = lt - elif operator == "le": - self.operator = le - elif operator == "eq": - self.operator = eq - elif operator == "ne": - self.operator = ne - elif operator == "ge": - self.operator = ge - elif operator == "gt": - self.operator = gt - - def process_dataset_entry(self, data_entry): - input_value = data_entry[self.input_field] - target = self.target_value - if self.operator(input_value, target): - return [DataEntry(data=data_entry)] - else: - return [DataEntry(data=None)] - - class GetSource(BaseParallelProcessor): """ Args: @@ -346,18 +101,8 @@ def process_dataset_entry(self, data_entry): input_values = os.path.splitext(data_entry[self.input_field])[0].split("/") data_entry[self.output_field] = input_values[-1]# + ", " +input_values[-2] - # if input_values[-2] == "Նար-Դոս": - # data_entry[self.output_field] += " (1867 - 1933), " + "https://hy.wikisource.org/wiki/%D5%80%D5%A5%D5%B2%D5%AB%D5%B6%D5%A1%D5%AF:%D5%86%D5%A1%D6%80-%D4%B4%D5%B8%D5%BD" - # elif input_values[-2] == "Ակսել Բակունց": - # data_entry[self.output_field] += " (1899 - 1937), " + "https://aybuben.com/axel-bakunts" return [DataEntry(data=data_entry)] -def read_jsonl(manifest_file): - rec = [] - with open(manifest_file, 'r') as the_file: - for l in the_file: - rec.append(json.loads(l)) - return pd.DataFrame.from_records(rec) class MakeTsv(BaseProcessor): """ @@ -369,10 +114,10 @@ def __init__( super().__init__(**kwargs) def process(self): - df1 = read_jsonl(self.input_manifest_file) + df1 = pd.DataFrame.from_records(load_manifest(self.input_manifest_file)) df1.to_csv(self.output_manifest_file, index=None, sep='\t') -class RandomPart(BaseProcessor): +class RandomTsvPart(BaseProcessor): """ """ def __init__( diff --git 
a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py
new file mode 100644
index 00000000..e9ee080c
--- /dev/null
+++ b/sdp/processors/modify_manifest/create_manifest.py
@@ -0,0 +1,44 @@
+from pathlib import Path
+
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+class CreateInitialManifestByExt(BaseParallelProcessor):
+    """
+    A class for creating an initial dataset manifest from all files with a given extension under a directory.
+
+    Args:
+    - raw_data_dir (str): The directory containing the files to include in the initial dataset manifest.
+    - output_field (str): The field to store the paths to the files in the dataset.
+    - extention (str): The file extension of the files to include in the dataset.
+    - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    Methods:
+    - prepare(): Creates the directory for saving the initial dataset manifest.
+    - read_manifest(): Recursively collects the paths of all files with the specified extension under raw_data_dir.
+    - process_dataset_entry(data_entry): Wraps a single file path into a DataEntry stored under the output field.
+
+    Note:
+    - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from files with a given extension.
+    """
+
+    def __init__(
+        self,
+        raw_data_dir: str,
+        output_field: str = "audio_filepath",
+        extention: str = "mp3",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = Path(raw_data_dir)
+        self.output_field = output_field
+        self.extention = extention
+
+    def read_manifest(self):
+        input_files = [str(self.raw_data_dir / video) for video in \
            self.raw_data_dir.rglob('*.'
+ self.extention)]
+        return input_files
+
+    def process_dataset_entry(self, data_entry):
+        data = {self.output_field: data_entry}
+        return [DataEntry(data=data)]
+    
\ No newline at end of file
diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py
index d72b941d..3dcad0ea 100644
--- a/sdp/processors/modify_manifest/data_to_data.py
+++ b/sdp/processors/modify_manifest/data_to_data.py
@@ -14,14 +14,138 @@
 
 import collections
 import re
+import os
 from typing import Dict, List
+import soundfile as sf
 
 from sdp.logging import logger
 from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+from sdp.utils.common import ffmpeg_convert
 from sdp.utils.edit_spaces import add_start_end_spaces, remove_extra_spaces
 from sdp.utils.get_diff import get_diff_with_subs_grouped
 
 
+class AudioDuration(BaseParallelProcessor):
+    """
+    Compute the audio duration using the audio file path from input_field
+
+    Args:
+        input_field (str): where to get the path to the wav file.
+        output_field (str): where to put the audio duration.
+    Returns:
+        All the same fields as in the input manifest plus output_field
+    """
+    def __init__(
+        self,
+        input_field: str,
+        output_field: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.input_field = input_field
+        self.output_field = output_field
+
+    def process_dataset_entry(self, data_entry):
+        audio_filepath = data_entry[self.input_field]
+        try:
+            data, samplerate = sf.read(audio_filepath)
+            data_entry[self.output_field]=data.shape[0]/samplerate
+        except Exception as e:
+            logger.warning(str(e) + " file: " + audio_filepath)
+            data_entry[self.output_field] = -1.0
+        return [DataEntry(data=data_entry)]
+
+
+class FfmpegConvert(BaseParallelProcessor):
+    """
+    A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio.
+
+    Args:
+    - resampled_audio_dir (str): The directory to store the resampled audio files.
+ - input_field (str): The field in the dataset representing the path to the input video files. + - output_field (str): The field to store the path to the resampled audio files in the dataset. + - key_field (str): The field in the dataset representing the unique key or identifier for each entry. + - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. + - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. + """ + def __init__( + self, + resampled_audio_dir: str, + input_field: str, + output_field: str, + key_field: str = None, + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.key_field = key_field + self.resampled_audio_dir = resampled_audio_dir + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def prepare(self): + os.makedirs(os.path.split(self.output_manifest_file)[0], exist_ok=True) + os.makedirs(self.resampled_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + video = data_entry[self.input_field] + if self.key_field: + key = data_entry[self.key_field] + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + else: + key = os.path.splitext(video)[0].split("/")[-1] + audio = os.path.join(self.resampled_audio_dir, key) + ".wav" + + if not os.path.isfile(audio): + ffmpeg_convert(video, audio, self.target_samplerate, 
self.target_nchannels) + + data_entry[self.output_field]= audio + if self.key_field: + data_entry[self.key_field] = key + return [DataEntry(data=data_entry)] + + +class ReadTxt(BaseParallelProcessor): + """ + Read contentn from txt file to manifest + + Args: + input_field (str): where to get path to txt file. + output_field (str): where to put content of txt file. + """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + fname = data_entry[self.input_field] + data_list = [] + with open(fname, "r") as f: + for line in f: + line = line.strip() + if line: + data = data_entry.copy() + data[self.output_field] = line + data_list.append(DataEntry(data=data)) + return data_list + + class InsIfASRInsertion(BaseParallelProcessor): """Processor that adds substrings to transcription if they are present in ASR predictions. diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index 30c24d7e..a1d77c61 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -14,7 +14,8 @@ import collections import re -from typing import List +from typing import List, Union +from operator import lt, le, eq, ne, ge, gt from sdp.logging import logger from sdp.processors.base_processor import BaseParallelProcessor, DataEntry @@ -29,6 +30,59 @@ ) +class PreserveByThreshold(BaseParallelProcessor): + """ + A class for preserving dataset entries based on a specified condition involving a target value and an input field. + + Parameters: + - input_field (str): The field in the dataset entries to be evaluated. + - target_value (Union[int, str]): The value to compare with the input field. + - operator (str, optional): The operator to apply for comparison. 
Options: "lt" (less than), "le" (less than or equal to), + "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Attributes: + - input_field (str): The field in the dataset entries to be evaluated. + - target_value (Union[int, str]): The value to compare with the input field. + - operator (function): The operator function based on the specified operator. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. + """ + def __init__( + self, + input_field: str, + target_value: Union[int, str], + operator: str = "eq", + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.target_value = target_value + if operator == "lt": + self.operator = lt + elif operator == "le": + self.operator = le + elif operator == "eq": + self.operator = eq + elif operator == "ne": + self.operator = ne + elif operator == "ge": + self.operator = ge + elif operator == "gt": + self.operator = gt + + def process_dataset_entry(self, data_entry): + input_value = data_entry[self.input_field] + target = self.target_value + if self.operator(input_value, target): + return [DataEntry(data=data_entry)] + else: + return [DataEntry(data=None)] + class DropHighLowCharrate(BaseParallelProcessor): """Drops utterances if their character rate is too low or too high. 
diff --git a/sdp/processors/modify_manifest/speech_recognition.py b/sdp/processors/modify_manifest/speech_recognition.py new file mode 100644 index 00000000..6c9f7e84 --- /dev/null +++ b/sdp/processors/modify_manifest/speech_recognition.py @@ -0,0 +1,124 @@ +import json +import torch +import whisper # pip install -U openai-whisper +from tqdm import tqdm +from pathlib import Path +from sdp.processors.base_processor import BaseProcessor +from sdp.utils.common import load_manifest +from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + +class ASR_Whisper(BaseProcessor): + """ + Transcribe usinf ASR model from HuggingFace. + + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (str): Inference batch size. + """ + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + self.model = whisper.load_model(self.pretrained_model) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) + + item[self.output_text_field] = pred_text + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + def whisper_infer(self, audio_path): + audio = whisper.load_audio(audio_path) + + audio = whisper.pad_or_trim(audio) + mel = whisper.log_mel_spectrogram(audio) + mel = mel.to(self.device) + + _, probs = 
self.model.detect_language(mel) + lang = max(probs, key=probs.get) + + options = whisper.DecodingOptions() + result = whisper.decode(self.model, mel, options) + return result.text, lang + +class ASR_transformer(BaseProcessor): + """ + Transcribe usinf ASR model from HuggingFace. + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (str): Inference batch size. + """ + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda:0" + else: + self.device = "cpu" + + torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 + self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.pretrained_model, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True) + self.model.to(self.device) + + processor = AutoProcessor.from_pretrained(self.pretrained_model) + self.pipe = pipeline( + "automatic-speech-recognition", + model=self.model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + max_new_tokens=128, + chunk_length_s=30, + batch_size=16, + return_timestamps=True, + torch_dtype=torch_dtype, + device=self.device, + ) + + def process(self): + + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_text = self.pipe(item["audio_filepath"])["text"] + # print(pred_text) + + item[self.output_text_field] = pred_text + f.write(json.dumps(item, ensure_ascii=False) + '\n') \ No newline at end of 
file diff --git a/sdp/utils/common.py b/sdp/utils/common.py index 45f04242..eb70a071 100644 --- a/sdp/utils/common.py +++ b/sdp/utils/common.py @@ -13,14 +13,32 @@ # limitations under the License. import os +import json import tarfile import urllib import zipfile - +import subprocess import wget +from pathlib import Path +from typing import Dict, List, Union from sdp.logging import logger +def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: + # read NeMo manifest as a list of dicts + result = [] + with manifest.open() as f: + for line in f: + data = json.loads(line) + result.append(data) + return result + +def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): + process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] + if ar: + process_args = process_args[:-1] + process_args.extend(["-ar", str(ar), wav]) + return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) def download_file(source_url: str, target_directory: str, verbose = True): # make sure target_directory is an absolute path to avoid bugs when we change directories to download data later From 424edf7cec2fc06fef4114346685fb65703a6835 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 28 Nov 2023 03:24:50 -0800 Subject: [PATCH 038/115] langs Signed-off-by: Nikolay Karpov --- sdp/processors/{datasets/arm => langs}/__init__.py | 0 sdp/processors/{datasets/arm => langs}/armenian.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename sdp/processors/{datasets/arm => langs}/__init__.py (100%) rename sdp/processors/{datasets/arm => langs}/armenian.py (100%) diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/langs/__init__.py similarity index 100% rename from sdp/processors/datasets/arm/__init__.py rename to sdp/processors/langs/__init__.py diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/langs/armenian.py similarity index 100% rename from 
sdp/processors/datasets/arm/armenian.py rename to sdp/processors/langs/armenian.py From 0e2ca51a894cd2de02f11ddb1f85350827168f8b Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 28 Nov 2023 03:30:13 -0800 Subject: [PATCH 039/115] audio_books Signed-off-by: Nikolay Karpov --- .../armenian/{config.yaml => audio_books.yaml} | 0 dataset_configs/armenian/mcv.yaml | 2 +- dataset_configs/armenian/text.yaml | 10 +++++----- 3 files changed, 6 insertions(+), 6 deletions(-) rename dataset_configs/armenian/{config.yaml => audio_books.yaml} (100%) diff --git a/dataset_configs/armenian/config.yaml b/dataset_configs/armenian/audio_books.yaml similarity index 100% rename from dataset_configs/armenian/config.yaml rename to dataset_configs/armenian/audio_books.yaml diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index b7ef21be..1874ed42 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -10,7 +10,7 @@ processors: language_id: cv-corpus-15.0-2023-09-08-hy-AM output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: sdp.processors.datasets.arm.ASR_Whisper + - _target_: sdp.processors.ASR_Whisper output_manifest_file: ${workspace_dir}/manifest1.json pretrained_model: "large-v2" output_text_field: pred_text diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 03bb1610..744c1381 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -25,7 +25,7 @@ processors: - {"pattern": '\.\.\.', "repl": "…"} - {"pattern": "\\s+", "repl": " "} - - _target_: sdp.processors.datasets.arm.SplitBySentence + - _target_: sdp.processors.langs.armenian.SplitBySentence input_field: text_line output_field: text pattern: ':|\.|…' @@ -59,7 +59,7 @@ processors: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա 
Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - - _target_: sdp.processors.datasets.arm.NumWords + - _target_: sdp.processors.langs.armenian.NumWords output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" input_field: text @@ -77,7 +77,7 @@ processors: target_value: 3 operator: ge - - _target_: sdp.processors.datasets.arm.GetSource + - _target_: sdp.processors.langs.armenian.GetSource output_manifest_file: ${workspace_dir}/manifest9.json input_field: source_filepath output_field: Source @@ -90,10 +90,10 @@ processors: output_manifest_file: ${workspace_dir}/manifest11.json fields_to_keep: ["Sentence", "Source"] - - _target_: sdp.processors.datasets.arm.MakeTsv + - _target_: sdp.processors.langs.armenian.MakeTsv output_manifest_file: ${workspace_dir}/manifest12.tsv - - _target_: sdp.processors.datasets.arm.RandomTsvPart + - _target_: sdp.processors.langs.armenian.RandomTsvPart output_manifest_file: ${workspace_dir}/manifest13.tsv random_state: 100 part: 0.01 From 293648b149f295692e9299aaf0355a5b3500e7cc Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 28 Nov 2023 04:09:35 -0800 Subject: [PATCH 040/115] mv Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 43487b8d..5143a686 100644 --- a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -2,13 +2,13 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/arm processors: - - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt + - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: /mnt/ssd8/arm/mp3 extention: mp3 output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: 
sdp.processors.datasets.arm.FfmpegConvert + - _target_: sdp.processors.FfmpegConvert output_manifest_file: ${workspace_dir}/manifest1.json resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 @@ -17,12 +17,12 @@ processors: output_field: "audio_filepath" key_field: null - - _target_: sdp.processors.datasets.arm.AudioDuration + - _target_: sdp.processors.AudioDuration input_field: audio_filepath output_field: duration output_manifest_file: ${workspace_dir}/manifest2.json - - _target_: sdp.processors.datasets.arm.ASR_Whisper + - _target_: sdp.processors.ASR_Whisper output_manifest_file: ${workspace_dir}/manifest3.json pretrained_model: "large-v2" output_text_field: text From 970b9e733acecf1d192c66648054405f9fa316eb Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 1 Dec 2023 05:47:51 -0800 Subject: [PATCH 041/115] mv todata_to_data Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 8 +- sdp/processors/__init__.py | 8 +- sdp/processors/langs/__init__.py | 2 - sdp/processors/langs/armenian.py | 110 +++++------------- .../modify_manifest/data_to_data.py | 107 ++++++++++++++++- .../modify_manifest/speech_recognition.py | 18 +-- 6 files changed, 152 insertions(+), 101 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 744c1381..8af2af0f 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -8,7 +8,7 @@ processors: output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: sdp.processors.ReadTxt + - _target_: sdp.processors.ReadTxtLines input_field: source_filepath output_field: text_line output_manifest_file: ${workspace_dir}/manifest1.json @@ -25,10 +25,10 @@ processors: - {"pattern": '\.\.\.', "repl": "…"} - {"pattern": "\\s+", "repl": " "} - - _target_: sdp.processors.langs.armenian.SplitBySentence + - _target_: sdp.processors.SplitLineBySentence input_field: text_line output_field: text - pattern: 
':|\.|…' + end_pattern: ':|\.|…' output_manifest_file: ${workspace_dir}/manifest3.json - _target_: sdp.processors.DropIfRegexMatch @@ -59,7 +59,7 @@ processors: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - - _target_: sdp.processors.langs.armenian.NumWords + - _target_: sdp.processors.NumWords output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" input_field: text diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 6a89ebff..fde752dc 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -44,7 +44,9 @@ from sdp.processors.modify_manifest.data_to_data import ( AudioDuration, FfmpegConvert, - ReadTxt, + ReadTxtLines, + SplitLineBySentence, + NumWords, InsIfASRInsertion, SubIfASRSubstitution, SubMakeLowercase, @@ -73,7 +75,7 @@ from sdp.processors.nemo.pc_inference import PCInference from sdp.processors.modify_manifest.speech_recognition import ( - ASR_transformer, - ASR_Whisper, + ASRTransformer, + ASRWhisper, ) from sdp.processors.modify_manifest.create_manifest import CreateInitialManifestByExt \ No newline at end of file diff --git a/sdp/processors/langs/__init__.py b/sdp/processors/langs/__init__.py index ee3384e5..4fc50543 100644 --- a/sdp/processors/langs/__init__.py +++ b/sdp/processors/langs/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from .armenian import SplitBySentence, NumWords, GetSource, MakeTsv, RandomTsvPart diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index fcac6cee..39b38c2b 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -1,91 +1,23 @@ -import torch -import whisper # pip install -U openai-whisper import os -import json -import re import pandas as pd -from tqdm import tqdm -from pathlib import Path -from typing import Dict, List, Union -from operator import lt, le, eq, ne, ge, gt from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.processors.modify_manifest.common import load_manifest -from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - -class SplitBySentence(BaseParallelProcessor): - """ - Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. +class GetSource(BaseParallelProcessor): """ - def __init__( - self, - input_field: str, - output_field: str, - pattern: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - self.pattern = re.compile(pattern) + A class for extracting source information from file paths and updating the dataset. 
- def process_dataset_entry(self, data_entry): - line = data_entry[self.input_field] - data_list = [] - start = 0 - ends = [m.start() for m in self.pattern.finditer(line)] - if ends: - for end in ends: - sent = line[start:end+1].strip() - # if sent and sent[0].isupper(): - data = data_entry.copy() - data[self.output_field] = sent - data_list.append(DataEntry(data=data)) - start = end+1 - if start Date: Fri, 1 Dec 2023 06:36:38 -0800 Subject: [PATCH 042/115] mv torch Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/speech_recognition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/modify_manifest/speech_recognition.py b/sdp/processors/modify_manifest/speech_recognition.py index a1c31f21..a70616d3 100644 --- a/sdp/processors/modify_manifest/speech_recognition.py +++ b/sdp/processors/modify_manifest/speech_recognition.py @@ -1,5 +1,4 @@ import json -import torch from tqdm import tqdm from pathlib import Path from sdp.processors.base_processor import BaseProcessor @@ -24,6 +23,7 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + import torch import whisper # pip install -U openai-whisper self.pretrained_model = pretrained_model From a41218014662f4cd4f22f4b312b954679a407f0f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 4 Dec 2023 21:59:14 -0800 Subject: [PATCH 043/115] PR comments Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books.yaml | 14 ++--- dataset_configs/armenian/mcv.yaml | 10 +-- dataset_configs/armenian/text.yaml | 50 +++++++-------- sdp/processors/__init__.py | 2 +- sdp/processors/langs/armenian.py | 29 ++++----- .../modify_manifest/create_manifest.py | 10 +-- .../modify_manifest/data_to_data.py | 61 +++++++++---------- .../modify_manifest/speech_recognition.py | 4 +- 8 files changed, 87 insertions(+), 93 deletions(-) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 5143a686..710806e6 100644 --- 
a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -1,10 +1,10 @@ processors_to_run: "0:" -workspace_dir: /mnt/ssd8/arm +workspace_dir: /path/to/your/audio/books #/mnt/ssd8/arm processors: - _target_: sdp.processors.CreateInitialManifestByExt - raw_data_dir: /mnt/ssd8/arm/mp3 - extention: mp3 + raw_data_dir: ${workspace_dir}/mp3 + extension: mp3 output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json @@ -17,9 +17,9 @@ processors: output_field: "audio_filepath" key_field: null - - _target_: sdp.processors.AudioDuration - input_field: audio_filepath - output_field: duration + - _target_: sdp.processors.GetAudioDuration + audio_filepath_field: audio_filepath + duration_field: duration output_manifest_file: ${workspace_dir}/manifest2.json - _target_: sdp.processors.ASR_Whisper @@ -53,7 +53,7 @@ processors: - {"pattern": 'թարգմանություն', "repl": "թարգմանությունը"} - {"pattern": 'արտաշ է սեմինի', "repl": "Արտաշես Էմինի"} # double space to single space - - {"pattern": " ", "repl": " "} + - {"pattern": "\s+", "repl": " "} test_cases: - {input: {text: "Գրիմ եղբայրներ, անտարի թնակը, Ռուսերենիս թարգմանեց, ամալիահ Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - {input: {text: "Էտկարպո, Մատնիչ սիրտը, թարգմանություն արտաշ է սեմինի."}, output: {text: "Էդգար Պո, Մատնիչ սիրտը, թարգմանությունը Արտաշես Էմինի."}} diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index 1874ed42..6dbf0e58 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -1,16 +1,16 @@ processors_to_run: "0:" -workspace_dir: /mnt/ssd8/arm/mcv +workspace_dir: /path/to/your/mcv/files #/mnt/ssd8/arm/mcv processors: - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis raw_data_dir: /home/nkarpov/data/hy - extract_archive_dir: /mnt/ssd8/arm/mcv/row - resampled_audio_dir: 
/mnt/ssd8/arm/mcv/16k + extract_archive_dir: ${workspace_dir}/row + resampled_audio_dir: ${workspace_dir}/16k data_split: train language_id: cv-corpus-15.0-2023-09-08-hy-AM output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: sdp.processors.ASR_Whisper + - _target_: sdp.processors.ASRWhisper output_manifest_file: ${workspace_dir}/manifest1.json pretrained_model: "large-v2" output_text_field: pred_text @@ -26,7 +26,7 @@ processors: pred_text_key: pred_text cer_threshold: 30 - - _target_: sdp.processors.ASR_transformer #pip install accelerate + - _target_: sdp.processors.ASRTransformer #pip install accelerate input_manifest_file: ${workspace_dir}/manifest1.json output_manifest_file: ${workspace_dir}/manifest4.json pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 8af2af0f..e1e5e3f7 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -1,16 +1,16 @@ processors_to_run: "0:" -workspace_dir: /mnt/ssd8/arm/txt +workspace_dir: /path/to/your/txt/files processors: - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: /home/nkarpov/workspace/arm_docs_old/arm_docs #/home/nkarpov/workspace/arm_docs - extention: txt + extension: txt output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - _target_: sdp.processors.ReadTxtLines - input_field: source_filepath - output_field: text_line + source_filepath: source_filepath + text_key: text_line output_manifest_file: ${workspace_dir}/manifest1.json - _target_: sdp.processors.SubRegex @@ -25,14 +25,17 @@ processors: - {"pattern": '\.\.\.', "repl": "…"} - {"pattern": "\\s+", "repl": " "} + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text_line":"text"} + - _target_: sdp.processors.SplitLineBySentence - input_field: text_line - output_field: text + 
text_key: text end_pattern: ':|\.|…' - output_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest4.json - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json regex_patterns: - 'տիկ\. $' - 'Գ\. $' @@ -53,54 +56,47 @@ processors: - '&' - _target_: sdp.processors.DropNonAlphabet - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "՝՞՜՛`֊´’'՚-ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև,:\\.…;" test_cases: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - _target_: sdp.processors.NumWords - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest7.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" input_field: text output_field: num_words - _target_: sdp.processors.PreserveByThreshold - output_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: ${workspace_dir}/manifest8.json input_field: num_words target_value: 15 operator: le - _target_: sdp.processors.PreserveByThreshold - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest9.json input_field: num_words target_value: 3 operator: ge - _target_: sdp.processors.langs.armenian.GetSource - output_manifest_file: ${workspace_dir}/manifest9.json - input_field: source_filepath - output_field: Source + output_manifest_file: ${workspace_dir}/manifest10.json + source_filepath: source_filepath + source_field: Source - _target_: sdp.processors.RenameFields - 
output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest11.json rename_fields: {"text": "Sentence"} - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest12.json fields_to_keep: ["Sentence", "Source"] - _target_: sdp.processors.langs.armenian.MakeTsv - output_manifest_file: ${workspace_dir}/manifest12.tsv + output_manifest_file: ${workspace_dir}/manifest13.tsv - _target_: sdp.processors.langs.armenian.RandomTsvPart - output_manifest_file: ${workspace_dir}/manifest13.tsv + output_manifest_file: ${workspace_dir}/manifest14.tsv random_state: 100 - part: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - input_manifest_file: ${workspace_dir}/manifest11.json - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: Sentence - regex_patterns: - - '^…' \ No newline at end of file + part: 0.01 \ No newline at end of file diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index fde752dc..17249df0 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -42,7 +42,7 @@ KeepOnlySpecifiedFields, ) from sdp.processors.modify_manifest.data_to_data import ( - AudioDuration, + GetAudioDuration, FfmpegConvert, ReadTxtLines, SplitLineBySentence, diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 39b38c2b..fede4669 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -1,44 +1,45 @@ import os import pandas as pd +from pathlib import Path from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry -from sdp.processors.modify_manifest.common import load_manifest +from sdp.utils.common import load_manifest class GetSource(BaseParallelProcessor): """ - A class for extracting source information from file paths and updating the dataset. 
+ Processor for extracting source information from file paths and updating the manifest. Args: - - input_field (str): The field containing the file path in the dataset. - - output_field (str): The field to store the extracted source information in the dataset. + - source_filepath (str): The field containing the file path in the manifest. + - source_field (str): The field to store the extracted source information in the manifest. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the dataset. + - process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the manifest. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the dataset. + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. """ def __init__( self, - input_field: str, - output_field: str, + source_filepath: str, + source_field: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.input_field = source_filepath + self.output_field = source_field def process_dataset_entry(self, data_entry): input_values = os.path.splitext(data_entry[self.input_field])[0].split("/") - data_entry[self.output_field] = input_values[-1]# + ", " +input_values[-2] + data_entry[self.output_field] = input_values[-1] return [DataEntry(data=data_entry)] class MakeTsv(BaseProcessor): """ - A class for converting a JSON manifest file to a TSV (Tab-Separated Values) file. + Processor for converting a JSON manifest file to a TSV (Tab-Separated Values) file. 
Args: - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. @@ -56,12 +57,12 @@ def __init__( super().__init__(**kwargs) def process(self): - df1 = pd.DataFrame.from_records(load_manifest(self.input_manifest_file)) + df1 = pd.DataFrame.from_records(load_manifest(Path(self.input_manifest_file))) df1.to_csv(self.output_manifest_file, index=None, sep='\t') class RandomTsvPart(BaseProcessor): """ - A class for creating a random subset of a TSV (Tab-Separated Values) file based on the specified fraction. + Processor for creating a random subset of a TSV (Tab-Separated Values) file based on the specified fraction. Args: - part (float): The fraction of the dataset to include in the random subset, should be in the range (0.0, 1.0). diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index e9ee080c..8d8fc954 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -4,12 +4,12 @@ class CreateInitialManifestByExt(BaseParallelProcessor): """ - A class for creating an initial dataset manifest from image and text files with common keys. + Processor for creating an initial dataset manifest by saving filepaths with a common extension to the field specified in output_field. Args: - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. - output_field (str): The field to store the paths to the files in the dataset. - - extention (str): The field stecify extention of the file in the dataset. + - extension (str): The extension of the files to include in the dataset. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
Methods: @@ -25,17 +25,17 @@ def __init__( self, raw_data_dir: str, output_field: str = "audio_filepath", - extention: str = "mp3", + extension: str = "mp3", **kwargs, ): super().__init__(**kwargs) self.raw_data_dir = Path(raw_data_dir) self.output_field = output_field - self.extention = extention + self.extension = extension def read_manifest(self): input_files = [str(self.raw_data_dir / video) for video in \ - self.raw_data_dir.rglob('*.' + self.extention)] + self.raw_data_dir.rglob('*.' + self.extension)] return input_files def process_dataset_entry(self, data_entry): diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 011b4420..3d8b0edd 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -25,40 +25,40 @@ from sdp.utils.get_diff import get_diff_with_subs_grouped -class AudioDuration(BaseParallelProcessor): +class GetAudioDuration(BaseParallelProcessor): """ - Count audio duration using audio file path from input_field + Processor to count audio duration using audio file path from input_field Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to audio duration. + audio_filepath_field (str): where to get path to wav file. + duration_field (str): where to put to audio duration. 
Returns: All the same fields as in the input manifest plus output_field """ def __init__( self, - input_field: str, - output_field: str, + audio_filepath_field: str, + duration_field: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.audio_filepath_field = audio_filepath_field + self.duration_field = duration_field def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.input_field] + audio_filepath = data_entry[self.audio_filepath_field] try: data, samplerate = sf.read(audio_filepath) - data_entry[self.output_field]=data.shape[0]/samplerate + data_entry[self.duration_field]=data.shape[0]/samplerate except Exception as e: logger.warning(str(e) + " file: " + audio_filepath) - data_entry[self.output_field] = -1.0 + data_entry[self.duration_field] = -1.0 return [DataEntry(data=data_entry)] class FfmpegConvert(BaseParallelProcessor): """ - A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. + Processor for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. Args: - resampled_audio_dir (str): The directory to store the resampled audio files. @@ -117,28 +117,28 @@ def process_dataset_entry(self, data_entry): class ReadTxtLines(BaseParallelProcessor): """ - A class for reading text lines from a file and updating the dataset. + Processor for reading text lines from a file and updating the manifest. Args: - - input_field (str): The field containing the file path in the dataset. - - output_field (str): The field to store the read text lines in the dataset. + - source_filepath (str): The field containing the file path in the manifest. + - text_key (str): The field to store the read text lines in the manifest. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the dataset. + - process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the manifest. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the dataset. + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. """ def __init__( self, - input_field: str, - output_field: str, + source_filepath: str, + text_key: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.input_field = source_filepath + self.output_field = text_key def process_dataset_entry(self, data_entry): fname = data_entry[self.input_field] @@ -155,11 +155,10 @@ def process_dataset_entry(self, data_entry): class SplitLineBySentence(BaseParallelProcessor): """ - A class for splitting lines of text into sentences based on a specified pattern. + Processor for splitting lines of text into sentences based on a specified pattern. Args: - - input_field (str): The field containing the input text lines in the dataset. - - output_field (str): The field to store the output sentences in the dataset. + - text_key (str): The field containing the input text lines in the dataset. - end_pattern (str): The regular expression pattern to identify sentence boundaries. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
@@ -171,18 +170,16 @@ class SplitLineBySentence(BaseParallelProcessor): """ def __init__( self, - input_field: str, - output_field: str, + text_key: str, end_pattern: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.text_key = text_key self.pattern = re.compile(end_pattern) def process_dataset_entry(self, data_entry): - line = data_entry[self.input_field] + line = data_entry[self.text_key] data_list = [] start = 0 ends = [m.start() for m in self.pattern.finditer(line)] @@ -191,21 +188,21 @@ def process_dataset_entry(self, data_entry): sent = line[start:end+1].strip() # if sent and sent[0].isupper(): data = data_entry.copy() - data[self.output_field] = sent + data[self.text_key] = sent data_list.append(DataEntry(data=data)) start = end+1 if start Date: Mon, 4 Dec 2023 22:36:35 -0800 Subject: [PATCH 044/115] paths Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books.yaml | 6 +++--- dataset_configs/armenian/mcv.yaml | 2 +- dataset_configs/armenian/text.yaml | 2 +- .../modify_manifest/speech_recognition.py | 13 +++++++------ 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 710806e6..64a935c5 100644 --- a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/audio/books #/mnt/ssd8/arm +workspace_dir: /path/to/your/audio/books processors: - _target_: sdp.processors.CreateInitialManifestByExt @@ -22,7 +22,7 @@ processors: duration_field: duration output_manifest_file: ${workspace_dir}/manifest2.json - - _target_: sdp.processors.ASR_Whisper + - _target_: sdp.processors.ASRWhisper output_manifest_file: ${workspace_dir}/manifest3.json pretrained_model: "large-v2" output_text_field: text @@ -53,7 +53,7 @@ processors: - {"pattern": 'թարգմանություն', "repl": "թարգմանությունը"} - 
{"pattern": 'արտաշ է սեմինի', "repl": "Արտաշես Էմինի"} # double space to single space - - {"pattern": "\s+", "repl": " "} + - {"pattern": "\\s+", "repl": " "} test_cases: - {input: {text: "Գրիմ եղբայրներ, անտարի թնակը, Ռուսերենիս թարգմանեց, ամալիահ Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - {input: {text: "Էտկարպո, Մատնիչ սիրտը, թարգմանություն արտաշ է սեմինի."}, output: {text: "Էդգար Պո, Մատնիչ սիրտը, թարգմանությունը Արտաշես Էմինի."}} diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index 6dbf0e58..2044f0bd 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/mcv/files #/mnt/ssd8/arm/mcv +workspace_dir: /path/to/your/mcv/files processors: - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index e1e5e3f7..6f970e02 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -3,7 +3,7 @@ workspace_dir: /path/to/your/txt/files processors: - _target_: sdp.processors.CreateInitialManifestByExt - raw_data_dir: /home/nkarpov/workspace/arm_docs_old/arm_docs #/home/nkarpov/workspace/arm_docs + raw_data_dir: ${workspace_dir}/arm_docs extension: txt output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json diff --git a/sdp/processors/modify_manifest/speech_recognition.py b/sdp/processors/modify_manifest/speech_recognition.py index d60a1ede..9d55a4a7 100644 --- a/sdp/processors/modify_manifest/speech_recognition.py +++ b/sdp/processors/modify_manifest/speech_recognition.py @@ -25,7 +25,7 @@ def __init__( super().__init__(**kwargs) import torch import whisper # pip install -U openai-whisper - + self.whisper = whisper self.pretrained_model = pretrained_model self.output_text_field = 
output_text_field self.device = device @@ -50,17 +50,17 @@ def process(self): f.write(json.dumps(item, ensure_ascii=False) + '\n') def whisper_infer(self, audio_path): - audio = whisper.load_audio(audio_path) + audio = self.whisper.load_audio(audio_path) - audio = whisper.pad_or_trim(audio) - mel = whisper.log_mel_spectrogram(audio) + audio = self.whisper.pad_or_trim(audio) + mel = self.whisper.log_mel_spectrogram(audio) mel = mel.to(self.device) _, probs = self.model.detect_language(mel) lang = max(probs, key=probs.get) - options = whisper.DecodingOptions() - result = whisper.decode(self.model, mel, options) + options = self.whisper.DecodingOptions() + result = self.whisper.decode(self.model, mel, options) return result.text, lang class ASRTransformer(BaseProcessor): @@ -82,6 +82,7 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + import torch from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline self.pretrained_model = pretrained_model From 460cbbb577fa90b550bfe46da76c223a94acb738 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 4 Dec 2023 22:42:32 -0800 Subject: [PATCH 045/115] rename Signed-off-by: Nikolay Karpov --- .../{modify_manifest => huggingface}/speech_recognition.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sdp/processors/{modify_manifest => huggingface}/speech_recognition.py (100%) diff --git a/sdp/processors/modify_manifest/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py similarity index 100% rename from sdp/processors/modify_manifest/speech_recognition.py rename to sdp/processors/huggingface/speech_recognition.py From f3cebd2facc83d31f03a351ea8b1182edbf75070 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 4 Dec 2023 22:50:40 -0800 Subject: [PATCH 046/115] import Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 
17249df0..d470eee9 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -74,7 +74,7 @@ from sdp.processors.nemo.asr_inference import ASRInference from sdp.processors.nemo.pc_inference import PCInference -from sdp.processors.modify_manifest.speech_recognition import ( +from sdp.processors.huggingface.speech_recognition import ( ASRTransformer, ASRWhisper, ) From 9a8d4f2df6ef9f2cd3a953202fd77ccb23a03b63 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 5 Dec 2023 05:14:50 -0800 Subject: [PATCH 047/115] docs Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 6 +++--- docs/src/sdp/api.rst | 14 ++++++++++++++ sdp/processors/__init__.py | 2 +- sdp/processors/modify_manifest/data_to_data.py | 14 +++++++------- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 6f970e02..7c76d226 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -62,11 +62,11 @@ processors: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - - _target_: sdp.processors.NumWords + - _target_: sdp.processors.CountNumWords output_manifest_file: ${workspace_dir}/manifest7.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" - input_field: text - output_field: num_words + text_key: text + num_words_key: num_words - _target_: sdp.processors.PreserveByThreshold output_manifest_file: ${workspace_dir}/manifest8.json diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 806bf7ff..3dfd37aa 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -200,6 +200,20 @@ Miscellaneous .. 
autodata:: sdp.processors.KeepOnlySpecifiedFields :annotation: +.. autodata:: sdp.processors.GetAudioDuration + :annotation: + +.. autodata:: sdp.processors.FfmpegConvert + :annotation: + +.. autodata:: sdp.processors.ReadTxtLines + :annotation: + +.. autodata:: sdp.processors.SplitLineBySentence + :annotation: + +.. autodata:: sdp.processors.CountNumWords + :annotation: .. _sdp-base-classes: diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index d470eee9..a0198f2d 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -46,7 +46,7 @@ FfmpegConvert, ReadTxtLines, SplitLineBySentence, - NumWords, + CountNumWords, InsIfASRInsertion, SubIfASRSubstitution, SubMakeLowercase, diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 3d8b0edd..2893e9d2 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -200,13 +200,13 @@ def process_dataset_entry(self, data_entry): return data_list -class NumWords(BaseParallelProcessor): +class CountNumWords(BaseParallelProcessor): """ Processor for counting the number of words in a text and updating the dataset. Args: - - input_field (str): The field containing the input text in the dataset. - - output_field (str): The field to store the number of words in the dataset. + - text_key (str): The field containing the input text in the dataset. + - num_words_key (str): The field to store the number of words in the dataset. - alphabet (str): The alphabet to be used for word tokenization. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
@@ -218,14 +218,14 @@ class NumWords(BaseParallelProcessor): """ def __init__( self, - input_field: str, - output_field: str, + text_key: str, + num_words_key: str, alphabet: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.input_field = text_key + self.output_field = num_words_key self.pattern = re.compile("[^"+alphabet+"]") def process_dataset_entry(self, data_entry): From c3ba8c9843637a07751a397820454e36728776fd Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 5 Dec 2023 23:55:38 -0800 Subject: [PATCH 048/115] subprocess Signed-off-by: Nikolay Karpov --- docs/src/sdp/api.rst | 3 + sdp/processors/__init__.py | 1 + .../modify_manifest/data_to_data.py | 63 +++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 3dfd37aa..1e3f6749 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -215,6 +215,9 @@ Miscellaneous .. autodata:: sdp.processors.CountNumWords :annotation: +.. autodata:: sdp.processors.Subprocess + :annotation: + .. 
_sdp-base-classes: Base classes diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index a0198f2d..cc09db27 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -42,6 +42,7 @@ KeepOnlySpecifiedFields, ) from sdp.processors.modify_manifest.data_to_data import ( + Subprocess, GetAudioDuration, FfmpegConvert, ReadTxtLines, diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 2893e9d2..e3585306 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -15,6 +15,7 @@ import collections import re import os +import subprocess from typing import Dict, List import soundfile as sf @@ -25,6 +26,68 @@ from sdp.utils.get_diff import get_diff_with_subs_grouped +class Subprocess(BaseProcessor): + """ + Processor for handling subprocess execution with additional features for managing input and output manifests. + + Parameters: + - cmd (str): The command to be executed as a subprocess. + - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. + - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + - **kwargs: Additional keyword arguments to be passed to the base class. + + Methods: + - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. 
+ + Example: + ```yaml + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: /workspace/manifest.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + ``` + + """ + + def __init__( + self, + cmd: str, + input_manifest_arg: str = "", + output_manifest_arg: str = "", + arg_separator: str = "=", + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_arg = input_manifest_arg + self.output_manifest_arg = output_manifest_arg + self.arg_separator = arg_separator + self.cmd = cmd + + def process(self): + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: + logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") + raise ValueError + process_args = [x for x in self.cmd.split(" ") if x] + if self.arg_separator == " ": + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg, self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg, self.output_manifest_file]) + else: + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) + + subprocess.run(process_args) + + class GetAudioDuration(BaseParallelProcessor): """ Processor to count audio duration using 
audio file path from input_field From 21005a2bec0cacf103061c003528bcd01cb3417a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 10 Dec 2023 21:31:59 -0800 Subject: [PATCH 049/115] Subprocess Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 2 +- sdp/processors/modify_manifest/common.py | 61 +++++++++++++++++ .../modify_manifest/data_to_data.py | 65 +------------------ 3 files changed, 63 insertions(+), 65 deletions(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index cc09db27..fa1eacc5 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,6 +32,7 @@ NormalizeFromNonPCTextVoxpopuli, ) from sdp.processors.modify_manifest.common import ( + Subprocess, AddConstantFields, ChangeToRelativePath, CombineSources, @@ -42,7 +43,6 @@ KeepOnlySpecifiedFields, ) from sdp.processors.modify_manifest.data_to_data import ( - Subprocess, GetAudioDuration, FfmpegConvert, ReadTxtLines, diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 40947b07..7182066d 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -1,5 +1,6 @@ import json import os +import subprocess from typing import Dict, List from tqdm import tqdm @@ -10,6 +11,66 @@ DataEntry, ) +class Subprocess(BaseProcessor): + """ + Processor for handling subprocess execution with additional features for managing input and output manifests. + + Parameters: + - cmd (str): The command to be executed as a subprocess. + - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. + - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + - **kwargs: Additional keyword arguments to be passed to the base class. 
+ + Methods: + - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. + + Example: + ```yaml + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: /workspace/manifest.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + ``` + + """ + def __init__( + self, + cmd: str, + input_manifest_arg: str = "", + output_manifest_arg: str = "", + arg_separator: str = "=", + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_arg = input_manifest_arg + self.output_manifest_arg = output_manifest_arg + self.arg_separator = arg_separator + self.cmd = cmd + + def process(self): + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: + logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") + raise ValueError + process_args = [x for x in self.cmd.split(" ") if x] + if self.arg_separator == " ": + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg, self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg, self.output_manifest_file]) + else: + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg + self.arg_separator + 
self.output_manifest_file]) + + subprocess.run(process_args) + class CombineSources(BaseParallelProcessor): """Can be used to create a single field from two alternative sources. diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index e3585306..abd9ef11 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -15,7 +15,6 @@ import collections import re import os -import subprocess from typing import Dict, List import soundfile as sf @@ -25,68 +24,6 @@ from sdp.utils.edit_spaces import add_start_end_spaces, remove_extra_spaces from sdp.utils.get_diff import get_diff_with_subs_grouped - -class Subprocess(BaseProcessor): - """ - Processor for handling subprocess execution with additional features for managing input and output manifests. - - Parameters: - - cmd (str): The command to be executed as a subprocess. - - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - - **kwargs: Additional keyword arguments to be passed to the base class. - - Methods: - - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. 
- - Example: - ```yaml - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: /workspace/manifest.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - ``` - - """ - - def __init__( - self, - cmd: str, - input_manifest_arg: str = "", - output_manifest_arg: str = "", - arg_separator: str = "=", - **kwargs, - ): - super().__init__(**kwargs) - self.input_manifest_arg = input_manifest_arg - self.output_manifest_arg = output_manifest_arg - self.arg_separator = arg_separator - self.cmd = cmd - - def process(self): - os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: - logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") - raise ValueError - process_args = [x for x in self.cmd.split(" ") if x] - if self.arg_separator == " ": - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg, self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg, self.output_manifest_file]) - else: - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) - - subprocess.run(process_args) - class GetAudioDuration(BaseParallelProcessor): """ @@ -294,7 +231,7 @@ def __init__( def 
process_dataset_entry(self, data_entry): text = data_entry[self.input_field] cleaned_string = self.pattern.sub('', text).strip() - cleaned_string = re.sub('\s+', ' ', cleaned_string).strip() + cleaned_string = re.sub('\\s+', ' ', cleaned_string).strip() words = cleaned_string.split() num_words = len(words) data_entry[self.output_field] = num_words From 999590953bc4d4826628fba41632b637f68ecf07 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 11 Dec 2023 00:16:52 -0800 Subject: [PATCH 050/115] fix docs Signed-off-by: Nikolay Karpov --- docs/src/sdp/api.rst | 23 +++++---- sdp/processors/modify_manifest/common.py | 19 ++++--- .../modify_manifest/data_to_data.py | 50 +++++++++---------- 3 files changed, 48 insertions(+), 44 deletions(-) diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 1e3f6749..0a3997a1 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -78,6 +78,11 @@ used in the downstream processing for additional enhancement or filtering. .. autodata:: sdp.processors.PCInference :annotation: +.. autodata:: sdp.processors.ASRWhisper + :annotation: + +.. autodata:: sdp.processors.ASRTransformer + :annotation: Text-only processors #################### @@ -87,6 +92,9 @@ Text-only processors ``text_key`` (defaults to "text") to control which field is used for modifications/filtering. +.. autodata:: sdp.processors.ReadTxtLines + :annotation: + Data modifications '''''''''''''''''' @@ -102,6 +110,12 @@ Data modifications .. autodata:: sdp.processors.MakeLettersUppercaseAfterPeriod :annotation: +.. autodata:: sdp.processors.SplitLineBySentence + :annotation: + +.. autodata:: sdp.processors.CountNumWords + :annotation: + Data filtering '''''''''''''' @@ -206,15 +220,6 @@ Miscellaneous .. autodata:: sdp.processors.FfmpegConvert :annotation: -.. autodata:: sdp.processors.ReadTxtLines - :annotation: - -.. autodata:: sdp.processors.SplitLineBySentence - :annotation: - -.. autodata:: sdp.processors.CountNumWords - :annotation: - .. 
autodata:: sdp.processors.Subprocess :annotation: diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 7182066d..73696dd3 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -16,18 +16,18 @@ class Subprocess(BaseProcessor): Processor for handling subprocess execution with additional features for managing input and output manifests. Parameters: - - cmd (str): The command to be executed as a subprocess. - - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - - **kwargs: Additional keyword arguments to be passed to the base class. + cmd (str): The command to be executed as a subprocess. + input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. + arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + **kwargs: Additional keyword arguments to be passed to the base class. Methods: - - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. + process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. 
- Example: - ```yaml - - _target_: sdp.processors.datasets.commoncrawl.Subprocess + Example:: + + _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: /workspace/manifest.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" @@ -35,7 +35,6 @@ class Subprocess(BaseProcessor): cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - ``` """ def __init__( diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index abd9ef11..d9be39d6 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -61,19 +61,19 @@ class FfmpegConvert(BaseParallelProcessor): Processor for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. Args: - - resampled_audio_dir (str): The directory to store the resampled audio files. - - input_field (str): The field in the dataset representing the path to the input video files. - - output_field (str): The field to store the path to the resampled audio files in the dataset. - - key_field (str): The field in the dataset representing the unique key or identifier for each entry. - - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. - - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + resampled_audio_dir (str): The directory to store the resampled audio files. 
+ input_field (str): The field in the dataset representing the path to the input video files. + output_field (str): The field to store the path to the resampled audio files in the dataset. + key_field (str): The field in the dataset representing the unique key or identifier for each entry. + target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. + target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. + process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ def __init__( self, @@ -120,15 +120,15 @@ class ReadTxtLines(BaseParallelProcessor): Processor for reading text lines from a file and updating the manifest. Args: - - source_filepath (str): The field containing the file path in the manifest. - - text_key (str): The field to store the read text lines in the manifest. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + source_filepath (str): The field containing the file path in the manifest. + text_key (str): The field to store the read text lines in the manifest. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the manifest. + process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the manifest. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. """ def __init__( self, @@ -158,15 +158,15 @@ class SplitLineBySentence(BaseParallelProcessor): Processor for splitting lines of text into sentences based on a specified pattern. Args: - - text_key (str): The field containing the input text lines in the dataset. - - end_pattern (str): The regular expression pattern to identify sentence boundaries. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + text_key (str): The field containing the input text lines in the dataset. + end_pattern (str): The regular expression pattern to identify sentence boundaries. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, splitting the input text lines into sentences based on the specified pattern, and updates the dataset. + process_dataset_entry(data_entry): Processes a single dataset entry, splitting the input text lines into sentences based on the specified pattern, and updates the dataset. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split lines of text into sentences based on a specified pattern. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to split lines of text into sentences based on a specified pattern. 
""" def __init__( self, @@ -205,16 +205,16 @@ class CountNumWords(BaseParallelProcessor): Processor for counting the number of words in a text and updating the dataset. Args: - - text_key (str): The field containing the input text in the dataset. - - num_words_key (str): The field to store the number of words in the dataset. - - alphabet (str): The alphabet to be used for word tokenization. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + text_key (str): The field containing the input text in the dataset. + num_words_key (str): The field to store the number of words in the dataset. + alphabet (str): The alphabet to be used for word tokenization. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, counts the number of words, and updates the dataset. + process_dataset_entry(data_entry): Processes a single dataset entry, counts the number of words, and updates the dataset. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to count the number of words in a text and update the dataset. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to count the number of words in a text and update the dataset. """ def __init__( self, From 8cfdf39f94036851e0187169f088296dbf454bd6 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 11 Dec 2023 01:04:38 -0800 Subject: [PATCH 051/115] CreateInitialManifestByExt doc Signed-off-by: Nikolay Karpov --- docs/src/sdp/api.rst | 3 +++ .../modify_manifest/create_manifest.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 0a3997a1..69e2b061 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -223,6 +223,9 @@ Miscellaneous .. autodata:: sdp.processors.Subprocess :annotation: +.. 
autodata:: sdp.processors.CreateInitialManifestByExt + :annotation: + .. _sdp-base-classes: Base classes diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index 8d8fc954..ac3a30d7 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -7,18 +7,18 @@ class CreateInitialManifestByExt(BaseParallelProcessor): Processor for creating an initial dataset manifest by saving filepaths with a common extension to the field specified in output_field. Args: - - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. - - output_field (str): The field to store the paths to the files in the dataset. - - extension (str): The field stecify extention of the file in the dataset. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. + output_field (str): The field to store the paths to the files in the dataset. + extension (str): The field specifying the extension of the file in the dataset. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - prepare(): Creates the directory for saving the initial dataset manifest. - - read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. 
+ process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. """ def __init__( From af7ca036803259721ca8a0595b3e79992b95c8f7 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 11 Dec 2023 04:45:20 -0800 Subject: [PATCH 052/115] drop_abs_path Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 43 ++++++++++++------- dataset_configs/commoncrawl/big_en.yaml | 40 +++++++++++------ dataset_configs/commoncrawl/big_fr.yaml | 43 ++++++++++++------- dataset_configs/commoncrawl/big_pl.yaml | 34 ++++++++++----- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 22 ++++++++++ 6 files changed, 127 insertions(+), 57 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 7686277b..ff555fd9 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -1,10 +1,11 @@ -processors_to_run: "0:" # ü ä ö ß Ä Ö Ü +processors_to_run: "0:" lang: de -workspace_dir: /mnt/md1/out/${lang} # /mnt/md0/common_crawl/cc_sdp/de +base_dir: /path/to/dataset/folder +workspace_dir: ${base_dir}/${lang} processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md1/out/manifest11.json + input_manifest_file: /path/to/dataset/folder/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: ${lang} @@ -16,7 +17,7 @@ processors: - _target_: sdp.processors.ASRInference output_manifest_file: 
${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + pretrained_model: nvidia/stt_${lang}_fastconformer_hybrid_large_pc batch_size: 64 - _target_: sdp.processors.DuplicateFields @@ -53,7 +54,7 @@ processors: - {"pattern": '„', "repl": '"'} - {"pattern": '®', "repl": ' '} - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate output_manifest_file: ${workspace_dir}/manifest5.json @@ -72,9 +73,8 @@ processors: output_manifest_arg: "--output_filename" arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - # --overwrite_cache + --language=${lang} --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist.tsv" - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest8.json @@ -90,7 +90,7 @@ processors: - {"pattern": '\$', "repl": ""} - {"pattern": "'", "repl": " "} - {"pattern": "[^a-zA-ZäöüÄÖÜßẞ.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest10.json @@ -106,7 +106,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest13.json @@ -128,7 +128,7 @@ processors: regex_params_list: - {"pattern": 
"[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighWER output_manifest_file: ${workspace_dir}/manifest17.json @@ -142,19 +142,30 @@ processors: pred_text_key: pred_text cer_threshold: 30 + - _target_: sdp.processors.KeepOnlySpecifiedFields + fields_to_keep: ["audio_filepath", "duration", "text_pc"] + + - _target_: sdp.processors.RenameFields + rename_fields: {"text_pc":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + output_manifest_file: ${workspace_dir}/manifest_${lang}.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/ + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest18.json - output_manifest_file: ${workspace_dir}/manifest19_dev.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest18.json - output_manifest_file: ${workspace_dir}/manifest19_test.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 3e3a5ec6..7b27d561 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -1,21 +1,23 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/out/en #/mnt/md0/common_crawl/cc_sdp/en +lang: en +base_dir: /path/to/dataset/folder 
+workspace_dir: ${base_dir}/${lang} processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md1/out/manifest11.json + input_manifest_file: /path/to/dataset/folder/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang - target_value: en + target_value: ${lang} - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang - target_value: en + target_value: ${lang} - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + pretrained_model: nvidia/stt_${lang}_fastconformer_hybrid_large_pc batch_size: 64 - _target_: sdp.processors.DropIfRegexMatch @@ -114,7 +116,7 @@ processors: arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist/asr_with_pc.tsv" - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest9.json @@ -227,7 +229,7 @@ processors: - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest27.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + pretrained_model: nvidia/stt_${lang}_fastconformer_hybrid_large_pc batch_size: 64 - _target_: sdp.processors.DuplicateFields @@ -244,7 +246,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - 
_target_: sdp.processors.DropHighWER output_manifest_file: ${workspace_dir}/manifest31.json @@ -296,19 +298,31 @@ processors: pred_text_key: text_asr_pred cer_threshold: 30 + - _target_: sdp.processors.KeepOnlySpecifiedFields + input_manifest_file: ${workspace_dir}/manifest20.json + fields_to_keep: ["audio_filepath", "duration", "text_pc"] + + - _target_: sdp.processors.RenameFields + rename_fields: {"text_pc":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + output_manifest_file: ${workspace_dir}/manifest_${lang}.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/ + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest20_train.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest19.json - output_manifest_file: ${workspace_dir}/manifest20_dev.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest19.json - output_manifest_file: ${workspace_dir}/manifest20_test.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 1f81ab38..ff9b065c 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -1,15 +1,16 @@ processors_to_run: "0:" lang: fr -workspace_dir: /mnt/md1/out/${lang} #/mnt/md0/common_crawl/cc_sdp/fr +base_dir: /path/to/dataset/folder +workspace_dir: ${base_dir}/${lang} processors: - - 
_target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md1/out/manifest11.json + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: ${base_dir}/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: ${lang} - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: ${lang} @@ -70,15 +71,14 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest8.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/fr/data/whitelist.tsv" - # --overwrite_cache + --language=${lang} --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist.tsv" - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest9.json @@ -95,7 +95,7 @@ processors: - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - - {"pattern": ' ', "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest11.json @@ -111,7 +111,7 @@ processors: regex_params_list: - {"pattern": 
"[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest14.json @@ -133,7 +133,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighWER output_manifest_file: ${workspace_dir}/manifest18.json @@ -147,19 +147,30 @@ processors: pred_text_key: pred_text cer_threshold: 30 + - _target_: sdp.processors.KeepOnlySpecifiedFields + fields_to_keep: ["audio_filepath", "duration", "text_pc"] + + - _target_: sdp.processors.RenameFields + rename_fields: {"text_pc":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + output_manifest_file: ${workspace_dir}/manifest_${lang}.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/ + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest20.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest19.json - output_manifest_file: ${workspace_dir}/manifest20_dev.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest19.json - output_manifest_file: ${workspace_dir}/manifest20_test.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_pl.yaml 
b/dataset_configs/commoncrawl/big_pl.yaml index ff2f7847..42e31d65 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -1,10 +1,11 @@ processors_to_run: "0:" lang: pl -workspace_dir: /mnt/md1/out/${lang} #/mnt/md0/common_crawl/cc_sdp/pl +base_dir: /path/to/dataset/folder +workspace_dir: ${base_dir}/${lang} processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md1/out/manifest11.json + input_manifest_file: ${base_dir}/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: ${lang} @@ -50,7 +51,7 @@ processors: - {"pattern": '„', "repl": '"'} - {"pattern": '®', "repl": ' '} - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate output_manifest_file: ${workspace_dir}/manifest5.json @@ -75,7 +76,7 @@ processors: - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^a-pr-uwy-zA-PR-UWY-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest8.json @@ -91,7 +92,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest11.json @@ -113,7 +114,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighWER output_manifest_file: ${workspace_dir}/manifest15.json @@ -127,19 +128,30 @@ processors: pred_text_key: pred_text cer_threshold: 30 + - _target_: sdp.processors.KeepOnlySpecifiedFields + 
fields_to_keep: ["audio_filepath", "duration", "text_pc"] + + - _target_: sdp.processors.RenameFields + rename_fields: {"text_pc":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + output_manifest_file: ${workspace_dir}/manifest_${lang}.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/ + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest17.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest17_dev.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest17_test.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 55877778..15281419 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -16,4 +16,4 @@ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ - TrainDevTestSplitCC + TrainDevTestSplitCC, drop_abs_path diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 949c371a..2d47e4dc 100644 
--- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -18,6 +18,28 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance +class drop_abs_path(BaseParallelProcessor): + """ + Args: + path_key (str): where to get path to wav file. + abs_path_to_drop (str): string to drop from the bigining of path to wav file. + """ + def __init__( + self, + path_key: str, + abs_path_to_drop: str, + **kwargs, + ): + super().__init__(**kwargs) + self.path_key = path_key + self.abs_path_to_drop = abs_path_to_drop + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.path_key] + data_entry[self.path_key]=audio_filepath[len(self.abs_path_to_drop):] + return [DataEntry(data=data_entry)] + + class TrainDevTestSplitCC(BaseParallelProcessor): """Custom train-dev-test split for CORAAL dataset. 
From c12b73286ee92f7753c5129089344ac2020b5f4e Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 12 Dec 2023 09:50:46 -0800 Subject: [PATCH 053/115] add lang Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 9 ++++++--- dataset_configs/commoncrawl/big_en.yaml | 10 +++++++--- dataset_configs/commoncrawl/big_fr.yaml | 9 ++++++--- dataset_configs/commoncrawl/big_pl.yaml | 9 ++++++--- sdp/processors/datasets/commoncrawl/commoncrawl.py | 7 ++++--- 5 files changed, 29 insertions(+), 15 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index ff555fd9..63e2256d 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -148,24 +148,27 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.AddConstantFields + fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git 
a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 7b27d561..9a3e6fdc 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -305,24 +305,28 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.AddConstantFields + fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index ff9b065c..580beba4 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -153,24 +153,27 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.AddConstantFields + fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path output_manifest_file: 
${workspace_dir}/manifest_${lang}.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 42e31d65..e3318a32 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -134,24 +134,27 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.AddConstantFields + fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json + 
output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index c4301f1c..97710f52 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -20,7 +20,9 @@ class drop_abs_path(BaseParallelProcessor): """ - Args: + Drop absolute path + + Args: path_key (str): where to get path to wav file. abs_path_to_drop (str): string to drop from the bigining of path to wav file. """ @@ -451,7 +453,7 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] class Subprocess(BaseProcessor): - """ + """ A class for handling subprocess execution with additional features for managing input and output manifests. Parameters: @@ -473,7 +475,6 @@ class Subprocess(BaseProcessor): Note: - The `BaseProcessor` class is assumed to be the base class, providing common functionality. 
""" - def __init__( self, cmd: str, From f310b01fe68c1996872e3847e842b7cf005054af Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 13 Dec 2023 21:09:55 -0800 Subject: [PATCH 054/115] deps Signed-off-by: Nikolay Karpov --- .github/workflows/doc-build.yml | 1 + .github/workflows/tests.yml | 1 + dataset_configs/armenian/audio_books.yaml | 2 +- dataset_configs/armenian/mcv.yaml | 4 +-- dataset_configs/armenian/text.yaml | 2 +- sdp/processors/langs/armenian.py | 26 +++++++++---------- .../modify_manifest/create_manifest.py | 2 +- 7 files changed, 20 insertions(+), 18 deletions(-) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index 694cc7f8..a85181f5 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -33,6 +33,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install -r requirements/main.txt pip install -r requirements/docs.txt - name: Build docs with sphinx run: | diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fe057b60..4208efe4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,6 +23,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install -r requirements/main.txt pip install -r requirements/docs.txt # we are being quite strict here, but hopefully that will not be too inconvenient - name: Checking that documentation builds with no warnings and all links are working diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 64a935c5..767e2829 100644 --- a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/audio/books +workspace_dir: ??? 
processors: - _target_: sdp.processors.CreateInitialManifestByExt diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index 2044f0bd..b8440386 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/mcv/files +workspace_dir: ??? processors: - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis - raw_data_dir: /home/nkarpov/data/hy + raw_data_dir: ${workspace_dir} extract_archive_dir: ${workspace_dir}/row resampled_audio_dir: ${workspace_dir}/16k data_split: train diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 7c76d226..f72b6a9a 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/txt/files +workspace_dir: ??? processors: - _target_: sdp.processors.CreateInitialManifestByExt diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index fede4669..fdf265bc 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -10,15 +10,15 @@ class GetSource(BaseParallelProcessor): Processor for extracting source information from file paths and updating the manifest. Args: - - source_filepath (str): The field containing the file path in the manifest. - - source_field (str): The field to store the extracted source information in the manifest. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + source_filepath (str): The field containing the file path in the manifest. + source_field (str): The field to store the extracted source information in the manifest. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the manifest. + process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the manifest. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. """ def __init__( self, @@ -42,13 +42,13 @@ class MakeTsv(BaseProcessor): Processor for converting a JSON manifest file to a TSV (Tab-Separated Values) file. Args: - - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. + **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. Methods: - - process(): Reads the input JSON manifest file, converts it to a DataFrame, and saves it as a TSV file. + process(): Reads the input JSON manifest file, converts it to a DataFrame, and saves it as a TSV file. Note: - - This class inherits from the `BaseProcessor` class and provides functionality to convert a JSON manifest file to a TSV file. + This class inherits from the `BaseProcessor` class and provides functionality to convert a JSON manifest file to a TSV file. """ def __init__( self, @@ -65,15 +65,15 @@ class RandomTsvPart(BaseProcessor): Processor for creating a random subset of a TSV (Tab-Separated Values) file based on the specified fraction. Args: - - part (float): The fraction of the dataset to include in the random subset, should be in the range (0.0, 1.0). - - random_state (int): Seed for reproducibility when generating the random subset. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
+ part (float): The fraction of the dataset to include in the random subset, should be in the range (0.0, 1.0). + random_state (int): Seed for reproducibility when generating the random subset. + **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. Methods: - - process(): Reads the input TSV manifest file, creates a random subset based on the specified fraction, and saves it as a new TSV file. + process(): Reads the input TSV manifest file, creates a random subset based on the specified fraction, and saves it as a new TSV file. Note: - - This class inherits from the `BaseProcessor` class and provides functionality to create a random subset of a TSV file. + This class inherits from the `BaseProcessor` class and provides functionality to create a random subset of a TSV file. """ def __init__( self, diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index ac3a30d7..d12060e8 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -9,7 +9,7 @@ class CreateInitialManifestByExt(BaseParallelProcessor): Args: raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. output_field (str): The field to store the paths to the files in the dataset. - extension (str): The field stecify extention of the file in the dataset. + extension (str): The field stecify extension of the file in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: From 7b7df73d2281da2ca4632f1ac536d9af44d7822d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:13:24 -0800 Subject: [PATCH 055/115] PreserveByValue Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 4 ++-- docs/src/sdp/api.rst | 3 +++ sdp/processors/__init__.py | 2 +- .../modify_manifest/data_to_dropbool.py | 23 +++++++++---------- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index f72b6a9a..f0598c54 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -68,13 +68,13 @@ processors: text_key: text num_words_key: num_words - - _target_: sdp.processors.PreserveByThreshold + - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest8.json input_field: num_words target_value: 15 operator: le - - _target_: sdp.processors.PreserveByThreshold + - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest9.json input_field: num_words target_value: 3 diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 69e2b061..27d71e51 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -157,6 +157,9 @@ Data modifications Data filtering '''''''''''''' +.. autodata:: sdp.processors.PreserveByValue + :annotation: + .. 
autodata:: sdp.processors.DropASRError :annotation: diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index fa1eacc5..1c06a48e 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -54,7 +54,7 @@ SubRegex, ) from sdp.processors.modify_manifest.data_to_dropbool import ( - PreserveByThreshold, + PreserveByValue, DropASRError, DropASRErrorBeginningEnd, DropHighCER, diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index a1d77c61..b9c2c376 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -30,27 +30,26 @@ ) -class PreserveByThreshold(BaseParallelProcessor): +class PreserveByValue(BaseParallelProcessor): """ - A class for preserving dataset entries based on a specified condition involving a target value and an input field. + Processor for preserving dataset entries based on a specified condition involving a target value and an input field. Parameters: - - input_field (str): The field in the dataset entries to be evaluated. - - target_value (Union[int, str]): The value to compare with the input field. - - operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), - "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + input_field (str): The field in the dataset entries to be evaluated. + target_value (Union[int, str]): The value to compare with the input field. + operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". 
+ **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Attributes: - - input_field (str): The field in the dataset entries to be evaluated. - - target_value (Union[int, str]): The value to compare with the input field. - - operator (function): The operator function based on the specified operator. + input_field (str): The field in the dataset entries to be evaluated. + target_value (Union[int, str]): The value to compare with the input field. + operator (function): The operator function based on the specified operator. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. + process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. 
""" def __init__( self, From a688b8a06ee756a5b2452f39438218c8be9e2c45 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:21:56 -0800 Subject: [PATCH 056/115] GetSourceFolder Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 2 +- sdp/processors/langs/armenian.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index f0598c54..af5940c3 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -80,7 +80,7 @@ processors: target_value: 3 operator: ge - - _target_: sdp.processors.langs.armenian.GetSource + - _target_: sdp.processors.langs.armenian.GetSourceFolder output_manifest_file: ${workspace_dir}/manifest10.json source_filepath: source_filepath source_field: Source diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index fdf265bc..19f3470d 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -5,13 +5,13 @@ from sdp.utils.common import load_manifest -class GetSource(BaseParallelProcessor): +class GetSourceFolder(BaseParallelProcessor): """ - Processor for extracting source information from file paths and updating the manifest. + Processor for extracting source folder from file paths and updating the manifest. Args: source_filepath (str): The field containing the file path in the manifest. - source_field (str): The field to store the extracted source information in the manifest. + source_field (str): The field to store the extracted source folder in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: From c196b508854bc38bc918af2ef06e24d6e2dd52c0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:40:44 -0800 Subject: [PATCH 057/115] drop Attributes Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/data_to_dropbool.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index b9c2c376..a6af6b5e 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -40,11 +40,6 @@ class PreserveByValue(BaseParallelProcessor): operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Attributes: - input_field (str): The field in the dataset entries to be evaluated. - target_value (Union[int, str]): The value to compare with the input field. - operator (function): The operator function based on the specified operator. - Methods: process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. 
From 4f22ff2b0a3f1aa34c6d08dec770103b1501bc4e Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:44:02 -0800 Subject: [PATCH 058/115] args Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/data_to_dropbool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index a6af6b5e..db1aad94 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -34,7 +34,7 @@ class PreserveByValue(BaseParallelProcessor): """ Processor for preserving dataset entries based on a specified condition involving a target value and an input field. - Parameters: + Args: input_field (str): The field in the dataset entries to be evaluated. target_value (Union[int, str]): The value to compare with the input field. operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". 
From 9a9831ccdaaf265f31ba7d03964bb758aa00bcac Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:51:01 -0800 Subject: [PATCH 059/115] rm methods Signed-off-by: Nikolay Karpov --- sdp/processors/langs/armenian.py | 9 --------- sdp/processors/modify_manifest/common.py | 5 +---- sdp/processors/modify_manifest/create_manifest.py | 5 ----- sdp/processors/modify_manifest/data_to_data.py | 12 ------------ sdp/processors/modify_manifest/data_to_dropbool.py | 3 --- 5 files changed, 1 insertion(+), 33 deletions(-) diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 19f3470d..2931217a 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -14,9 +14,6 @@ class GetSourceFolder(BaseParallelProcessor): source_field (str): The field to store the extracted source folder in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the manifest. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. """ @@ -44,9 +41,6 @@ class MakeTsv(BaseProcessor): Args: **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - Methods: - process(): Reads the input JSON manifest file, converts it to a DataFrame, and saves it as a TSV file. - Note: This class inherits from the `BaseProcessor` class and provides functionality to convert a JSON manifest file to a TSV file. """ @@ -69,9 +63,6 @@ class RandomTsvPart(BaseProcessor): random_state (int): Seed for reproducibility when generating the random subset. **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
- Methods: - process(): Reads the input TSV manifest file, creates a random subset based on the specified fraction, and saves it as a new TSV file. - Note: This class inherits from the `BaseProcessor` class and provides functionality to create a random subset of a TSV file. """ diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 73696dd3..2b57f15c 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -15,16 +15,13 @@ class Subprocess(BaseProcessor): """ Processor for handling subprocess execution with additional features for managing input and output manifests. - Parameters: + Args: cmd (str): The command to be executed as a subprocess. input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. arg_separator (str, optional): The separator used between argument and value. Defaults to "=". **kwargs: Additional keyword arguments to be passed to the base class. - Methods: - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. - Example:: _target_: sdp.processors.datasets.commoncrawl.Subprocess diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index d12060e8..ffbf6527 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -12,11 +12,6 @@ class CreateInitialManifestByExt(BaseParallelProcessor): extension (str): The field stecify extension of the file in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - prepare(): Creates the directory for saving the initial dataset manifest. 
- read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. """ diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index d9be39d6..faae7041 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -69,9 +69,6 @@ class FfmpegConvert(BaseParallelProcessor): target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ @@ -124,9 +121,6 @@ class ReadTxtLines(BaseParallelProcessor): text_key (str): The field to store the read text lines in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the manifest. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. """ @@ -162,9 +156,6 @@ class SplitLineBySentence(BaseParallelProcessor): end_pattern (str): The regular expression pattern to identify sentence boundaries. 
**kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, splitting the input text lines into sentences based on the specified pattern, and updates the dataset. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to split lines of text into sentences based on a specified pattern. """ @@ -210,9 +201,6 @@ class CountNumWords(BaseParallelProcessor): alphabet (str): The alphabet to be used for word tokenization. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, counts the number of words, and updates the dataset. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to count the number of words in a text and update the dataset. """ diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index db1aad94..0f871b26 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -40,9 +40,6 @@ class PreserveByValue(BaseParallelProcessor): operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. 
""" From f869773de1a401479247db6cc07373fcc572dad6 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 12:29:21 -0800 Subject: [PATCH 060/115] rm Note Signed-off-by: Nikolay Karpov --- sdp/processors/langs/armenian.py | 14 ++++---------- sdp/processors/modify_manifest/create_manifest.py | 2 -- sdp/processors/modify_manifest/data_to_data.py | 8 -------- sdp/processors/modify_manifest/data_to_dropbool.py | 2 -- 4 files changed, 4 insertions(+), 22 deletions(-) diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 2931217a..87664b61 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -14,8 +14,6 @@ class GetSourceFolder(BaseParallelProcessor): source_field (str): The field to store the extracted source folder in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. """ def __init__( self, @@ -24,13 +22,13 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.input_field = source_filepath - self.output_field = source_field + self.source_filepath = source_filepath + self.source_field = source_field def process_dataset_entry(self, data_entry): - input_values = os.path.splitext(data_entry[self.input_field])[0].split("/") + input_values = os.path.splitext(data_entry[self.source_filepath])[0].split("/") - data_entry[self.output_field] = input_values[-1] + data_entry[self.source_field] = input_values[-1] return [DataEntry(data=data_entry)] @@ -41,8 +39,6 @@ class MakeTsv(BaseProcessor): Args: **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - Note: - This class inherits from the `BaseProcessor` class and provides functionality to convert a JSON manifest file to a TSV file. 
""" def __init__( self, @@ -63,8 +59,6 @@ class RandomTsvPart(BaseProcessor): random_state (int): Seed for reproducibility when generating the random subset. **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - Note: - This class inherits from the `BaseProcessor` class and provides functionality to create a random subset of a TSV file. """ def __init__( self, diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index ffbf6527..77e306f9 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -12,8 +12,6 @@ class CreateInitialManifestByExt(BaseParallelProcessor): extension (str): The field stecify extension of the file in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. """ def __init__( diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index faae7041..b4f4a520 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -69,8 +69,6 @@ class FfmpegConvert(BaseParallelProcessor): target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ def __init__( self, @@ -121,8 +119,6 @@ class ReadTxtLines(BaseParallelProcessor): text_key (str): The field to store the read text lines in the manifest. 
**kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. """ def __init__( self, @@ -156,8 +152,6 @@ class SplitLineBySentence(BaseParallelProcessor): end_pattern (str): The regular expression pattern to identify sentence boundaries. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split lines of text into sentences based on a specified pattern. """ def __init__( self, @@ -201,8 +195,6 @@ class CountNumWords(BaseParallelProcessor): alphabet (str): The alphabet to be used for word tokenization. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to count the number of words in a text and update the dataset. """ def __init__( self, diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index 0f871b26..f286aa29 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -40,8 +40,6 @@ class PreserveByValue(BaseParallelProcessor): operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. 
""" def __init__( self, From 4224052b3849830723b1f0623ce7a0e9a5974f41 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 22:20:30 -0800 Subject: [PATCH 061/115] more fixes Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 3 +- sdp/processors/langs/armenian.py | 6 +- sdp/processors/modify_manifest/common.py | 57 ------------------- .../modify_manifest/create_manifest.py | 6 +- .../modify_manifest/data_to_data.py | 15 +++-- .../modify_manifest/data_to_dropbool.py | 2 + sdp/utils/common.py | 10 ++-- 7 files changed, 23 insertions(+), 76 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index af5940c3..6c82214d 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -1,3 +1,4 @@ +# Processing pipeline for text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html processors_to_run: "0:" workspace_dir: ??? @@ -80,7 +81,7 @@ processors: target_value: 3 operator: ge - - _target_: sdp.processors.langs.armenian.GetSourceFolder + - _target_: sdp.processors.langs.armenian.GetSourceBookName output_manifest_file: ${workspace_dir}/manifest10.json source_filepath: source_filepath source_field: Source diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 87664b61..18f34f62 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -5,13 +5,13 @@ from sdp.utils.common import load_manifest -class GetSourceFolder(BaseParallelProcessor): +class GetSourceBookName(BaseParallelProcessor): """ - Processor for extracting source folder from file paths and updating the manifest. + Processor for extracting source book name from file paths and updating the manifest. Args: source_filepath (str): The field containing the file path in the manifest. - source_field (str): The field to store the extracted source folder in the manifest. 
+ source_field (str): The field to store the extracted source book name in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 2b57f15c..97e86e07 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -11,63 +11,6 @@ DataEntry, ) -class Subprocess(BaseProcessor): - """ - Processor for handling subprocess execution with additional features for managing input and output manifests. - - Args: - cmd (str): The command to be executed as a subprocess. - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - **kwargs: Additional keyword arguments to be passed to the base class. 
- - Example:: - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: /workspace/manifest.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - - """ - def __init__( - self, - cmd: str, - input_manifest_arg: str = "", - output_manifest_arg: str = "", - arg_separator: str = "=", - **kwargs, - ): - super().__init__(**kwargs) - self.input_manifest_arg = input_manifest_arg - self.output_manifest_arg = output_manifest_arg - self.arg_separator = arg_separator - self.cmd = cmd - - def process(self): - os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: - logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") - raise ValueError - process_args = [x for x in self.cmd.split(" ") if x] - if self.arg_separator == " ": - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg, self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg, self.output_manifest_file]) - else: - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) - - subprocess.run(process_args) - - class CombineSources(BaseParallelProcessor): """Can be used to create a single field from two alternative 
sources. diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index 77e306f9..ba678ff7 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -7,9 +7,9 @@ class CreateInitialManifestByExt(BaseParallelProcessor): Processor for creating an initial dataset manifest by saving filepaths with a common extension to the field specified in output_field. Args: - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. + raw_data_dir (str): The root directory of the files to be added to the initial manifest. This processor will recursively look for files with the extension 'extension' inside this directory. output_field (str): The field to store the paths to the files in the dataset. - extension (str): The field stecify extension of the file in the dataset. + extension (str): The field stecify extension of the files to use them in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ @@ -27,7 +27,7 @@ def __init__( self.extension = extension def read_manifest(self): - input_files = [str(self.raw_data_dir / video) for video in \ + input_files = [str(self.raw_data_dir / file) for file in \ self.raw_data_dir.rglob('*.' + self.extension)] return input_files diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index b4f4a520..c55aa45e 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -62,7 +62,7 @@ class FfmpegConvert(BaseParallelProcessor): Args: resampled_audio_dir (str): The directory to store the resampled audio files. - input_field (str): The field in the dataset representing the path to the input video files. + input_field (str): The field in the dataset representing the path to the input video or audio files. 
output_field (str): The field to store the path to the resampled audio files in the dataset. key_field (str): The field in the dataset representing the unique key or identifier for each entry. target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. @@ -89,7 +89,6 @@ def __init__( self.target_nchannels = target_nchannels def prepare(self): - os.makedirs(os.path.split(self.output_manifest_file)[0], exist_ok=True) os.makedirs(self.resampled_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): @@ -187,12 +186,12 @@ def process_dataset_entry(self, data_entry): class CountNumWords(BaseParallelProcessor): """ - Processor for counting the number of words in a text and updating the dataset. + Processor for counting the number of words in the text_key field saving the number in num_words_key. Args: text_key (str): The field containing the input text in the dataset. num_words_key (str): The field to store the number of words in the dataset. - alphabet (str): The alphabet to be used for word tokenization. + alphabet (str): Characters to be used to count words. Any other characters are substituted by whitespace and not take into account. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" @@ -204,17 +203,17 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.input_field = text_key - self.output_field = num_words_key + self.text_key = text_key + self.num_words_key = num_words_key self.pattern = re.compile("[^"+alphabet+"]") def process_dataset_entry(self, data_entry): - text = data_entry[self.input_field] + text = data_entry[self.text_key] cleaned_string = self.pattern.sub('', text).strip() cleaned_string = re.sub('\\s+', ' ', cleaned_string).strip() words = cleaned_string.split() num_words = len(words) - data_entry[self.output_field] = num_words + data_entry[self.num_words_key] = num_words return [DataEntry(data=data_entry)] diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index f286aa29..2bd0eb88 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -63,6 +63,8 @@ def __init__( self.operator = ge elif operator == "gt": self.operator = gt + else: + raise ValueError('Operator must be one from the list: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than)') def process_dataset_entry(self, data_entry): input_value = data_entry[self.input_field] diff --git a/sdp/utils/common.py b/sdp/utils/common.py index eb70a071..f74faefd 100644 --- a/sdp/utils/common.py +++ b/sdp/utils/common.py @@ -33,11 +33,13 @@ def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: result.append(data) return result -def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): - process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] - if ar: +def ffmpeg_convert(input_file: str, output_wav: str, sample_rate: int = 0, num_channels: int = 1): + process_args = ["ffmpeg", "-i", input_file, + '-ac', str(num_channels), "-map", "0:a", "-c:a", + "pcm_s16le", "-y", output_wav] + if 
sample_rate: process_args = process_args[:-1] - process_args.extend(["-ar", str(ar), wav]) + process_args.extend(["-ar", str(sample_rate), output_wav]) return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) def download_file(source_url: str, target_directory: str, verbose = True): From 8289f822f32fad33fbd37c6326fc62125fd59934 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 18 Dec 2023 06:29:23 -0800 Subject: [PATCH 062/115] header Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books.yaml | 3 ++- dataset_configs/armenian/mcv.yaml | 3 ++- dataset_configs/armenian/text.yaml | 3 ++- .../huggingface/speech_recognition.py | 18 +++++++++++++++-- sdp/processors/langs/armenian.py | 20 +++++++++++++------ sdp/processors/modify_manifest/common.py | 15 +++++++++++++- .../modify_manifest/create_manifest.py | 14 +++++++++++++ .../modify_manifest/data_to_data.py | 2 +- .../modify_manifest/data_to_dropbool.py | 2 +- sdp/utils/common.py | 2 +- 10 files changed, 67 insertions(+), 15 deletions(-) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 767e2829..f50c2f41 100644 --- a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -1,5 +1,6 @@ processors_to_run: "0:" workspace_dir: ??? 
+final_manifest: ${workspace_dir}/final_manifest.json processors: - _target_: sdp.processors.CreateInitialManifestByExt @@ -39,7 +40,7 @@ processors: - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${final_manifest} regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'a', "repl": "ա"} diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index b8440386..77a28f35 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -1,5 +1,6 @@ processors_to_run: "0:" workspace_dir: ??? +final_manifest: ${workspace_dir}/final_manifest.json processors: - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis @@ -38,7 +39,7 @@ processors: wer_threshold: 75 - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${final_manifest} text_key: text pred_text_key: pred_text3 cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 6c82214d..0939d55f 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -1,6 +1,7 @@ # Processing pipeline for text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html processors_to_run: "0:" workspace_dir: ??? 
+final_manifest: ${workspace_dir}/final_manifest.json processors: - _target_: sdp.processors.CreateInitialManifestByExt @@ -91,7 +92,7 @@ processors: rename_fields: {"text": "Sentence"} - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${final_manifest} fields_to_keep: ["Sentence", "Source"] - _target_: sdp.processors.langs.armenian.MakeTsv diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 9d55a4a7..a1a0313a 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import json from tqdm import tqdm from pathlib import Path @@ -19,7 +33,7 @@ def __init__( pretrained_model: str, output_text_field: str, device: str = None, - batch_size: str = 1, + batch_size: int = 1, **kwargs, ): super().__init__(**kwargs) @@ -78,7 +92,7 @@ def __init__( pretrained_model: str, output_text_field: str, device: str = None, - batch_size: str = 1, + batch_size: int = 1, **kwargs, ): super().__init__(**kwargs) diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 18f34f62..1e290d29 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import pandas as pd from pathlib import Path @@ -40,12 +54,6 @@ class MakeTsv(BaseProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. """ - def __init__( - self, - **kwargs, - ): - super().__init__(**kwargs) - def process(self): df1 = pd.DataFrame.from_records(load_manifest(Path(self.input_manifest_file))) df1.to_csv(self.output_manifest_file, index=None, sep='\t') diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 97e86e07..e164ffa1 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -1,6 +1,19 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os -import subprocess from typing import Dict, List from tqdm import tqdm diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index ba678ff7..335724ca 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from pathlib import Path from sdp.processors.base_processor import BaseParallelProcessor, DataEntry diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index c55aa45e..9c8b1a84 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index 2bd0eb88..640f2dd0 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/sdp/utils/common.py b/sdp/utils/common.py index f74faefd..fd6b837b 100644 --- a/sdp/utils/common.py +++ b/sdp/utils/common.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From bd42a6c6ad06fc9d1a27a83748a2053b9a58ad99 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 18 Dec 2023 07:07:16 -0800 Subject: [PATCH 063/115] ASRWhisper Signed-off-by: Nikolay Karpov --- .../huggingface/speech_recognition.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index a1a0313a..fe575b78 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -20,13 +20,14 @@ class ASRWhisper(BaseProcessor): """ - Processor to transcribe using ASR Whisper model from HuggingFace. + Simple example to transcribe using ASR Whisper model from HuggingFace. 
+ There are many ways to improve it: make batch inference, split long files, return predicted language, etc. Args: pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. - batch_size (str): Inference batch size. + batch_size (int): Inference batch size. Defaults to 1. """ def __init__( self, @@ -85,14 +86,16 @@ class ASRTransformer(BaseProcessor): pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. - batch_size (str): Inference batch size. + batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 + torch_dtype (str): Tensor data type. Default to "float32" """ def __init__( self, pretrained_model: str, output_text_field: str, device: str = None, - batch_size: int = 1, + batch_size: int = 1, # TODO: support batch_size > 1 + torch_dtype: str = "float32", **kwargs, ): super().__init__(**kwargs) @@ -103,14 +106,20 @@ def __init__( self.output_text_field = output_text_field self.device = device self.batch_size = batch_size + if torch_dtype == "float32": + self.torch_dtype = torch.float32 + elif torch_dtype == "float16": + self.torch_dtype = torch.float16 + else: + raise NotImplementedError(torch_dtype + " is not implemented!") + if self.device is None: if torch.cuda.is_available(): self.device = "cuda:0" else: self.device = "cpu" - torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 - self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.pretrained_model, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True) + self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True) self.model.to(self.device) processor = AutoProcessor.from_pretrained(self.pretrained_model) From 
b2c1f0d0759ce02c972268cabfc160162332bc2f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 9 Jan 2024 04:30:46 -0800 Subject: [PATCH 064/115] AudioLid args Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 220 ++++++++---------- 1 file changed, 91 insertions(+), 129 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 243a12bc..b0ceaeb0 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -429,29 +429,28 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] class Subprocess(BaseProcessor): - """ - A class for handling subprocess execution with additional features for managing input and output manifests. - - Parameters: - - cmd (str): The command to be executed as a subprocess. - - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - - **kwargs: Additional keyword arguments to be passed to the base class. + """ + Processor for handling subprocess execution with additional features for managing input and output manifests. - Attributes: - - input_manifest_arg (str): The argument specifying the input manifest. - - output_manifest_arg (str): The argument specifying the output manifest. - - arg_separator (str): The separator used between argument and value. - - cmd (str): The command to be executed. + Args: + cmd (str): The command to be executed as a subprocess. + input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. 
+ arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + **kwargs: Additional keyword arguments to be passed to the base class. - Methods: - - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. + Example: + + _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: /workspace/manifest.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - Note: - - The `BaseProcessor` class is assumed to be the base class, providing common functionality. """ - def __init__( self, cmd: str, @@ -490,23 +489,12 @@ class NmtSubprocess(Subprocess): A class for executing Neural Machine Translation (NMT) subprocess with enhanced functionality for managing input and output fields. Parameters: - - input_field (str): The field in the input manifest containing the source text for translation. - - output_field (str): The field to store the translated output in the output manifest. - - srctext_file (str): The file path to store the source text for translation. - - tgtout_file (str): The file path to store the translated output. - - **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - - Attributes: - - input_field (str): The field in the input manifest containing the source text for translation. - - output_field (str): The field to store the translated output in the output manifest. - - srctext_file (str): The file path to store the source text for translation. 
- - tgtout_file (str): The file path to store the translated output. + input_field (str): The field in the input manifest containing the source text for translation. + output_field (str): The field to store the translated output in the output manifest. + srctext_file (str): The file path to store the source text for translation. + tgtout_file (str): The file path to store the translated output. + **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - Methods: - - process(): Executes the NMT subprocess, handling source text and translation output fields. - - Note: - - This class inherits from the `Subprocess` class and extends its functionality to handle NMT-specific processing. """ def __init__( @@ -542,19 +530,10 @@ class AlignerSubprocess(Subprocess): A class for aligning audio transcripts using an aligner subprocess with additional features for managing output fields. Parameters: - - output_field (str): The field in the output manifest to store the aligned transcripts. - - duration_threshold (int, optional): The maximum duration threshold for audio files in seconds. Files exceeding this threshold are excluded from alignment. Defaults to 5000. - - **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - - Attributes: - - output_field (str): The field in the output manifest to store the aligned transcripts. - - duration_threshold (int): The maximum duration threshold for audio files in seconds. + output_field (str): The field in the output manifest to store the aligned transcripts. + duration_threshold (int, optional): The maximum duration threshold for audio files in seconds. Files exceeding this threshold are excluded from alignment. Defaults to 5000. + **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - Methods: - - process(): Executes the aligner subprocess, handling text processing, duration filtering, alignment, and manifest updates. 
- - Note: - - This class inherits from the `Subprocess` class and extends its functionality to handle aligner-specific processing. """ def __init__( @@ -611,22 +590,12 @@ class PreserveByValue(BaseParallelProcessor): A class for preserving dataset entries based on a specified condition involving a target value and an input field. Parameters: - - input_field (str): The field in the dataset entries to be evaluated. - - target_value (Union[int, str]): The value to compare with the input field. - - operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), + input_field (str): The field in the dataset entries to be evaluated. + target_value (Union[int, str]): The value to compare with the input field. + operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Attributes: - - input_field (str): The field in the dataset entries to be evaluated. - - target_value (Union[int, str]): The value to compare with the input field. - - operator (function): The operator function based on the specified operator. - - Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. - - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. """ def __init__( self, @@ -864,14 +833,15 @@ class AudioLid(BaseProcessor): A class for language identification (LID) of audio files using a pre-trained LID model. 
Args: - - input_audio_field (str): The field in the dataset containing the path to the audio files for language identification. - - pretrained_model (str): The name of the pre-trained ASR model for language identification. - - output_lang_field (str): The field to store the identified language for each audio file. - - device (str): The device to run the ASR model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - - Note: - - This class inherits from the `BaseProcessor` class and extends its functionality to perform language identification using a pre-trained ASR model. + input_audio_field (str): The field in the dataset containing the path to the audio files for language identification. + pretrained_model (str): The name of the pre-trained ASR model for language identification. + output_lang_field (str): The field to store the identified language for each audio file. + device (str): The device to run the ASR model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. + segment_duration (float): Random sample duration in seconds. Delault is np.inf. + num_segments (int): Number of segments of file to use for majority vote. Delault is 1. + random_seed (int): Seed for generating the starting position of the segment. Delault is None. + **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
+ """ def __init__( self, @@ -879,12 +849,18 @@ def __init__( pretrained_model: str, output_lang_field: str, device: str, + segment_duration: float = np.inf, + num_segments: int = 1, + random_seed: int = None, **kwargs, ): super().__init__(**kwargs) self.input_audio_field = input_audio_field self.pretrained_model = pretrained_model self.output_lang_field = output_lang_field + self.segment_duration = segment_duration + self.num_segments = num_segments + self.random_seed = random_seed self.device = device def process(self): @@ -909,7 +885,7 @@ def process(self): audio_file = item[self.input_audio_field] try: - lang = model.get_label(audio_file, 60*5) + lang = model.get_label(audio_file, self.segment_duration, self.num_segments) except Exception as e: logger.warning("AudioLid " + audio_file+ " " + str(e)) lang = None @@ -924,18 +900,16 @@ class TextLid(BaseProcessor): A class for language identification (LID) of text using a pre-trained text classification model. Args: - - input_text_field (str): The field in the dataset containing the text for language identification. - - pretrained_model (str): The name or path of the pre-trained text classification model for language identification. - - output_lang_field (str): The field to store the identified language for each text. - - device (str): The device to run the text classification model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. - - drop_text_duplicates (bool, optional): If True, drops duplicate texts from the output manifest. Defaults to False. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. + input_text_field (str): The field in the dataset containing the text for language identification. + pretrained_model (str): The name or path of the pre-trained text classification model for language identification. + output_lang_field (str): The field to store the identified language for each text. 
+ device (str): The device to run the text classification model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. + drop_text_duplicates (bool, optional): If True, drops duplicate texts from the output manifest. Defaults to False. + **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. Methods: - process(): Processes the language identification for each text in the dataset and saves the results in a new manifest file. - Note: - - This class inherits from the `BaseProcessor` class and extends its functionality to perform language identification using a pre-trained text classification model. """ def __init__( self, @@ -991,15 +965,13 @@ class AllVttText(BaseParallelProcessor): A class for extracting text content from VTT (WebVTT) files and updating the manifest. Args: - - output_text_field (str): The field to store the extracted text content in the manifest. - - input_filepath_field (str, optional): The field in the manifest containing the path to VTT files. Defaults to "vtt_filepath". - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + output_text_field (str): The field to store the extracted text content in the manifest. + input_filepath_field (str, optional): The field in the manifest containing the path to VTT files. Defaults to "vtt_filepath". + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, extracts text content from the specified VTT file, and updates the manifest. + process_dataset_entry(data_entry): Processes a single dataset entry, extracts text content from the specified VTT file, and updates the manifest. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract text content from VTT files and update the manifest. 
""" def __init__( self, @@ -1028,18 +1000,16 @@ class TxtToVtt(BaseParallelProcessor): A class for converting text files to WebVTT (VTT) format and updating the manifest. Args: - - vtt_files_dir (str): The directory where the generated VTT files will be saved. - - key_field (str): The field in the manifest representing the unique key or identifier for each entry. - - text_field (str): The field in the manifest containing the text content to be converted to VTT format. - - vtt_field (str): The field to store the generated VTT file paths in the manifest. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + vtt_files_dir (str): The directory where the generated VTT files will be saved. + key_field (str): The field in the manifest representing the unique key or identifier for each entry. + text_field (str): The field in the manifest containing the text content to be converted to VTT format. + vtt_field (str): The field to store the generated VTT file paths in the manifest. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - prepare(): Creates the directory for saving the generated VTT files. - - process_dataset_entry(data_entry): Processes a single dataset entry, converts the text content to VTT format, and updates the manifest. + prepare(): Creates the directory for saving the generated VTT files. + process_dataset_entry(data_entry): Processes a single dataset entry, converts the text content to VTT format, and updates the manifest. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert text files to WebVTT (VTT) format and update the manifest. """ def __init__( self, @@ -1078,18 +1048,16 @@ class ReadParquet(BaseParallelProcessor): A class for reading information from Parquet files and updating the manifest with video URLs and captions. 
Args: - - output_video_field (str): The field to store the extracted video URLs in the manifest. - - output_caption_field (str): The field to store the extracted captions in the manifest. - - key_field (str): The field in the manifest representing the unique key or identifier for each entry. - - raw_data_dir (str): The directory containing Parquet files with information to be read. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + output_video_field (str): The field to store the extracted video URLs in the manifest. + output_caption_field (str): The field to store the extracted captions in the manifest. + key_field (str): The field in the manifest representing the unique key or identifier for each entry. + raw_data_dir (str): The directory containing Parquet files with information to be read. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - prepare(): Reads and prepares information from Parquet files, storing it in the `urls` DataFrame. - process_dataset_entry(data_entry): Processes a single dataset entry, extracts video URLs and captions based on the key, and updates the manifest. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read information from Parquet files and update the manifest with video URLs and captions. """ def __init__( self, @@ -1139,19 +1107,17 @@ class CreateInitialManifestCC(BaseParallelProcessor): A class for creating an initial dataset manifest from image and text files with common keys. Args: - - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. - - video_field (str): The field to store the paths to the image files in the dataset. - - key_field (str): The field to represent the common key or identifier for each entry. - - text_field (str): The field to store the paths to the text files in the dataset. 
- - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. + video_field (str): The field to store the paths to the image files in the dataset. + key_field (str): The field to represent the common key or identifier for each entry. + text_field (str): The field to store the paths to the text files in the dataset. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - prepare(): Creates the directory for saving the initial dataset manifest. - - read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. - - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. + prepare(): Creates the directory for saving the initial dataset manifest. + read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. + process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. """ def __init__( self, @@ -1198,19 +1164,17 @@ class FfmpegConvert(BaseParallelProcessor): A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. Args: - - resampled_audio_dir (str): The directory to store the resampled audio files. - - input_field (str): The field in the dataset representing the path to the input video files. - - output_field (str): The field to store the path to the resampled audio files in the dataset. 
- - key_field (str): The field in the dataset representing the unique key or identifier for each entry. - - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. - - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + resampled_audio_dir (str): The directory to store the resampled audio files. + input_field (str): The field in the dataset representing the path to the input video files. + output_field (str): The field to store the path to the resampled audio files in the dataset. + key_field (str): The field in the dataset representing the unique key or identifier for each entry. + target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. + target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. + process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ def __init__( self, @@ -1249,18 +1213,16 @@ class CreateInitialManifestExt(BaseParallelProcessor): A class for creating an initial dataset manifest from audio files with a specified extension. Args: - - raw_data_dir (str): The directory containing audio files to include in the initial dataset manifest. - - output_field (str, optional): The field to store the audio file paths in the dataset. Defaults to "audio_filepath". 
- - extention (str, optional): The file extension of the audio files to include in the manifest. Defaults to "mp3". - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + raw_data_dir (str): The directory containing audio files to include in the initial dataset manifest. + output_field (str, optional): The field to store the audio file paths in the dataset. Defaults to "audio_filepath". + extention (str, optional): The file extension of the audio files to include in the manifest. Defaults to "mp3". + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - prepare(): Creates the directory for saving the initial dataset manifest. - - read_manifest(): Reads the audio files with the specified extension and creates a DataFrame with the specified output field. - - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. + prepare(): Creates the directory for saving the initial dataset manifest. + read_manifest(): Reads the audio files with the specified extension and creates a DataFrame with the specified output field. + process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from audio files. 
""" def __init__( self, From 7935e44732aaa51673cea48818fa8ae2652f059d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 16 Jan 2024 07:58:59 -0800 Subject: [PATCH 065/115] GetSpecificFiles CopyFiles Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 9 ++- dataset_configs/commoncrawl/big_en.yaml | 18 +++--- dataset_configs/commoncrawl/big_fr.yaml | 11 +++- dataset_configs/commoncrawl/big_pl.yaml | 9 ++- dataset_configs/commoncrawl/big_sentence.yaml | 8 ++- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 57 ++++++++++++++++++- 7 files changed, 97 insertions(+), 17 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 63e2256d..372feb34 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -151,10 +151,15 @@ processors: - _target_: sdp.processors.AddConstantFields fields: {"lang": '${lang}'} - - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath - abs_path_to_drop: ${base_dir}/ + abs_path_to_drop: ${base_dir}/splited_manifests/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 9a3e6fdc..3be117ef 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -97,7 +97,7 @@ processors: - {"pattern": '®', "repl": ' '} # - {"pattern": "%", "repl": ' '} - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - 
_target_: sdp.processors.DropHighLowWordrate output_manifest_file: ${workspace_dir}/manifest6.json @@ -135,7 +135,7 @@ processors: - {"pattern": '!', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^A-Za-z'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} + - {"pattern": "\\s+", "repl": " "} test_cases: - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} @@ -160,7 +160,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest15.json @@ -176,7 +176,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest18.json @@ -308,13 +308,17 @@ processors: - _target_: sdp.processors.AddConstantFields fields: {"lang": '${lang}'} - - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath - abs_path_to_drop: ${base_dir}/ + abs_path_to_drop: ${base_dir}/splited_manifests/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train 
diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 580beba4..8db40b37 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -57,7 +57,7 @@ processors: - {"pattern": '„', "repl": '"'} - {"pattern": '®', "repl": ' '} - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate output_manifest_file: ${workspace_dir}/manifest6.json @@ -156,10 +156,15 @@ processors: - _target_: sdp.processors.AddConstantFields fields: {"lang": '${lang}'} - - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath - abs_path_to_drop: ${base_dir}/ + abs_path_to_drop: ${base_dir}/splited_manifests/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index e3318a32..13b0ee0a 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -136,11 +136,16 @@ processors: - _target_: sdp.processors.AddConstantFields fields: {"lang": '${lang}'} + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_levels: 2 - - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath - abs_path_to_drop: ${base_dir}/ + abs_path_to_drop: 
${base_dir}/splited_manifests/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index a930f770..48bff42c 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -93,4 +93,10 @@ processors: input_manifest_file: ${workspace_dir_s}/manifest5.json output_manifest_file: ${workspace_dir_s}/manifest5a.json input_field: source_audio - output_field: bandwidth \ No newline at end of file + output_field: bandwidth + + - _target_: sdp.processors.datasets.commoncrawl.GetSpecificFiles + input_manifest_file: ${workspace_dir_s}/manifest6.json + output_manifest_file: ${workspace_dir_s}/long_dev_test/manifest6.json + file_field: source_audio + path_to_copy: ${workspace_dir_s}/long_dev_test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 15281419..513c4147 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -16,4 +16,4 @@ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ - TrainDevTestSplitCC, drop_abs_path + TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 97710f52..974789fd 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -2,6 +2,7 @@ import re import math import json +import shutil import subprocess import librosa from tqdm import tqdm @@ -18,7 +19,7 @@ from 
sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance -class drop_abs_path(BaseParallelProcessor): +class DropAbsPath(BaseParallelProcessor): """ Drop absolute path @@ -42,6 +43,60 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] +class CopyFiles(BaseParallelProcessor): + def __init__( + self, + file_field : str, + path_to_copy: str, + path_levels: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.file_field = file_field + self.path_to_copy = path_to_copy + self.path_levels = path_levels + + def prepare(self): + os.makedirs(self.path_to_copy, exist_ok=True) + + def process_dataset_entry(self, data_entry): + rel_file_path = "/".join(data_entry[self.file_field].split("/")[-self.path_levels:]) + new_file_path = os.path.join(self.path_to_copy, rel_file_path) + + if not os.path.isfile(new_file_path): + os.makedirs(os.path.split(new_file_path)[0], exist_ok=True) + shutil.copyfile(data_entry[self.file_field], new_file_path) + data_entry[self.file_field] = new_file_path + return [DataEntry(data=data_entry)] + + +class GetSpecificFiles(BaseParallelProcessor): + def __init__( + self, + file_field : str, + path_to_copy: str, + **kwargs, + ): + super().__init__(**kwargs) + self.file_field = file_field + self.path_to_copy = path_to_copy + + self.split_map = set( + ['0634236', '0693626', '0029743', '0881322', '0357427', '0455788', '0198472', '0496259', '0812890', '0142281', '0076612', '0629004', '0931592', '0577447', '0768107', '0907768', '0963898', '0671754', '0851569', '0896715', + '0366790', '0837221', '0733702', '0278253', '0738313', '0437256', '0558223', '0292533', '0777911', '0826607', '0544257', '0744206', '0576248', '0307575', '0307577', '0879895', '0006783', '0006755', '0125649', '0896701'] + ) + def prepare(self): + os.makedirs(self.path_to_copy, 
exist_ok=True) + + def process_dataset_entry(self, data_entry): + file_id = os.path.splitext(data_entry[self.file_field])[0].split("/")[-1] + if file_id in self.split_map: + shutil.copyfile(data_entry[self.file_field],os.path.join(self.path_to_copy, file_id+".wav")) + return [DataEntry(data=data_entry)] + else: + return [] + + class TrainDevTestSplitCC(BaseParallelProcessor): """Custom train-dev-test split for CORAAL dataset. From 39ec1a3b14dfb571447c713082bfa234058b049e Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 18 Jan 2024 04:10:00 -0800 Subject: [PATCH 066/115] separate dev test Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 44 +++++++++++++++++++------ dataset_configs/commoncrawl/big_en.yaml | 42 ++++++++++++++++++----- dataset_configs/commoncrawl/big_fr.yaml | 42 ++++++++++++++++++----- dataset_configs/commoncrawl/big_pl.yaml | 42 ++++++++++++++++++----- 4 files changed, 133 insertions(+), 37 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 372feb34..2f525a21 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -149,31 +149,55 @@ processors: rename_fields: {"text_pc":"text"} - _target_: sdp.processors.AddConstantFields + output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + lang: ${lang} + data_split: train + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles file_field: audio_filepath - path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_to_copy: ${base_dir}/splited_manifests/${lang}_train/ path_levels: 2 - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: 
${base_dir}/splited_manifests/${lang}_train.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json - lang: ${lang} - data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_dev/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} - data_split: test \ No newline at end of file + data_split: test + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_test/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 3be117ef..17dd358a 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -306,31 +306,55 @@ processors: 
rename_fields: {"text_pc":"text"} - _target_: sdp.processors.AddConstantFields + output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + lang: ${lang} + data_split: train + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles file_field: audio_filepath - path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_to_copy: ${base_dir}/splited_manifests/${lang}_train/ path_levels: 2 - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json - lang: ${lang} - data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_dev/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + output_manifest_file: 
${workspace_dir}/manifest_${lang}_test.json lang: ${lang} - data_split: test \ No newline at end of file + data_split: test + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_test/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 8db40b37..ffbbbc7f 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -154,31 +154,55 @@ processors: rename_fields: {"text_pc":"text"} - _target_: sdp.processors.AddConstantFields + output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + lang: ${lang} + data_split: train + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles file_field: audio_filepath - path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_to_copy: ${base_dir}/splited_manifests/${lang}_train/ path_levels: 2 - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json - lang: ${lang} - data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - 
output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_dev/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} - data_split: test \ No newline at end of file + data_split: test + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_test/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 13b0ee0a..84c80c65 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -135,31 +135,55 @@ processors: rename_fields: {"text_pc":"text"} - _target_: sdp.processors.AddConstantFields + output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + lang: ${lang} + data_split: train + - _target_: 
sdp.processors.datasets.commoncrawl.CopyFiles file_field: audio_filepath - path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_to_copy: ${base_dir}/splited_manifests/${lang}_train/ path_levels: 2 - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json - lang: ${lang} - data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_dev/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} - data_split: test \ No newline at end of file + data_split: test + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_test/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + path_key: 
audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ \ No newline at end of file From 980eeb7f50c71d67458bba29c90d7122904cfb4b Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 03:19:32 -0800 Subject: [PATCH 067/115] rm Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 1c06a48e..7a3cca35 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,7 +32,6 @@ NormalizeFromNonPCTextVoxpopuli, ) from sdp.processors.modify_manifest.common import ( - Subprocess, AddConstantFields, ChangeToRelativePath, CombineSources, From 165c29551d2dc19936db287c14ace54d9d5d8df2 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 04:25:32 -0800 Subject: [PATCH 068/115] black Signed-off-by: Nikolay Karpov --- .../huggingface/speech_recognition.py | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index fe575b78..53aa4894 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -13,55 +13,60 @@ # limitations under the License. import json -from tqdm import tqdm from pathlib import Path + +from tqdm import tqdm + from sdp.processors.base_processor import BaseProcessor from sdp.utils.common import load_manifest + class ASRWhisper(BaseProcessor): """ Simple example to transcribe using ASR Whisper model from HuggingFace. There are many ways to improve it: make batch inference, split long files, return predicted language, etc. - + Args: pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. - batch_size (int): Inference batch size. Defaults to 1. 
""" + def __init__( self, pretrained_model: str, output_text_field: str, device: str = None, - batch_size: int = 1, + output_lang_field: str = "lid", **kwargs, ): super().__init__(**kwargs) import torch - import whisper # pip install -U openai-whisper + import whisper # pip install -U openai-whisper + self.whisper = whisper self.pretrained_model = pretrained_model self.output_text_field = output_text_field self.device = device - self.batch_size = batch_size + self.output_lang_field = output_lang_field if self.device is None: if torch.cuda.is_available(): self.device = "cuda" else: self.device = "cpu" self.model = whisper.load_model(self.pretrained_model) - + def process(self): json_list = load_manifest(Path(self.input_manifest_file)) - + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - + with Path(self.output_manifest_file).open('w') as f: for item in tqdm(json_list): pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) item[self.output_text_field] = pred_text + item[self.output_lang_field] = pred_lang f.write(json.dumps(item, ensure_ascii=False) + '\n') def whisper_infer(self, audio_path): @@ -73,15 +78,16 @@ def whisper_infer(self, audio_path): _, probs = self.model.detect_language(mel) lang = max(probs, key=probs.get) - + options = self.whisper.DecodingOptions() result = self.whisper.decode(self.model, mel, options) return result.text, lang - + + class ASRTransformer(BaseProcessor): """ Processor to transcribe using ASR Transformer model from HuggingFace. - + Args: pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. @@ -89,19 +95,20 @@ class ASRTransformer(BaseProcessor): batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 torch_dtype (str): Tensor data type. 
Default to "float32" """ + def __init__( self, pretrained_model: str, output_text_field: str, device: str = None, - batch_size: int = 1, # TODO: support batch_size > 1 + batch_size: int = 1, torch_dtype: str = "float32", **kwargs, ): super().__init__(**kwargs) import torch from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - + self.pretrained_model = pretrained_model self.output_text_field = output_text_field self.device = device @@ -118,10 +125,12 @@ def __init__( self.device = "cuda:0" else: self.device = "cpu" - - self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True) + + self.model = AutoModelForSpeechSeq2Seq.from_pretrained( + self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True + ) self.model.to(self.device) - + processor = AutoProcessor.from_pretrained(self.pretrained_model) self.pipe = pipeline( "automatic-speech-recognition", @@ -130,21 +139,20 @@ def __init__( feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=30, - batch_size=16, + batch_size=self.batch_size, return_timestamps=True, torch_dtype=torch_dtype, device=self.device, ) def process(self): - json_list = load_manifest(Path(self.input_manifest_file)) - + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - + with Path(self.output_manifest_file).open('w') as f: for item in tqdm(json_list): pred_text = self.pipe(item["audio_filepath"])["text"] item[self.output_text_field] = pred_text - f.write(json.dumps(item, ensure_ascii=False) + '\n') \ No newline at end of file + f.write(json.dumps(item, ensure_ascii=False) + '\n') From 8cd5896eac1697d820ebbe1b704c22b421b3400b Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 05:18:34 -0800 Subject: [PATCH 069/115] self.torch_dtype Signed-off-by: Nikolay Karpov --- sdp/processors/huggingface/speech_recognition.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 53aa4894..4112112e 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -141,7 +141,7 @@ def __init__( chunk_length_s=30, batch_size=self.batch_size, return_timestamps=True, - torch_dtype=torch_dtype, + torch_dtype=self.torch_dtype, device=self.device, ) From 43ff82d70167af85b5773564d3491fa15d1ef8a8 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 06:50:05 -0800 Subject: [PATCH 070/115] mv to cv Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/{text.yaml => text_cv.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dataset_configs/armenian/{text.yaml => text_cv.yaml} (100%) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text_cv.yaml similarity index 100% rename from dataset_configs/armenian/text.yaml rename to dataset_configs/armenian/text_cv.yaml From a24e6c93a0000bf493f4b80de3d186b10d1e143f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 07:02:54 -0800 Subject: [PATCH 071/115] mv configs Signed-off-by: Nikolay Karpov --- .../armenian/{audio_books.yaml => audio_books/config.yaml} | 0 dataset_configs/armenian/{mcv.yaml => mcv/config.yaml} | 0 dataset_configs/armenian/{text_cv.yaml => text_cv/config.yaml} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename dataset_configs/armenian/{audio_books.yaml => audio_books/config.yaml} (100%) rename dataset_configs/armenian/{mcv.yaml => mcv/config.yaml} (100%) rename dataset_configs/armenian/{text_cv.yaml => text_cv/config.yaml} (100%) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books/config.yaml similarity index 100% rename from dataset_configs/armenian/audio_books.yaml rename to dataset_configs/armenian/audio_books/config.yaml diff --git 
a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv/config.yaml similarity index 100% rename from dataset_configs/armenian/mcv.yaml rename to dataset_configs/armenian/mcv/config.yaml diff --git a/dataset_configs/armenian/text_cv.yaml b/dataset_configs/armenian/text_cv/config.yaml similarity index 100% rename from dataset_configs/armenian/text_cv.yaml rename to dataset_configs/armenian/text_cv/config.yaml From 157de3ad049e9fefa71c43537e4b39d6c5b254c1 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 23 Jan 2024 01:18:41 -0800 Subject: [PATCH 072/115] rename Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/{text_cv => text_mcv}/config.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dataset_configs/armenian/{text_cv => text_mcv}/config.yaml (100%) diff --git a/dataset_configs/armenian/text_cv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml similarity index 100% rename from dataset_configs/armenian/text_cv/config.yaml rename to dataset_configs/armenian/text_mcv/config.yaml From af69829629cee3b46c9e29f854ca8c68e47bceab Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 25 Jan 2024 21:44:13 -0800 Subject: [PATCH 073/115] ManifestToUtf8 Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 13 ++++++++++--- dataset_configs/commoncrawl/big_en.yaml | 13 +++++++++---- dataset_configs/commoncrawl/big_fr.yaml | 14 +++++++++++--- dataset_configs/commoncrawl/big_pl.yaml | 13 ++++++++++--- sdp/processors/datasets/commoncrawl/__init__.py | 2 +- sdp/processors/datasets/commoncrawl/commoncrawl.py | 9 +++++++++ 6 files changed, 50 insertions(+), 14 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 2f525a21..82fb85c7 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -148,12 +148,21 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: 
sdp.processors.SubRegex + text_key: text + regex_params_list: + - {"pattern": "\\s+\\?", "repl": "?"} + - {"pattern": "\\s+\\.", "repl": "."} + - {"pattern": "\\s+,", "repl": ","} + - {"pattern": "\\s+", "repl": " "} + + - _target_: sdp.processors.datasets.commoncrawl.ManifestToUtf8 + - _target_: sdp.processors.AddConstantFields output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train @@ -168,7 +177,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json @@ -185,7 +193,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 17dd358a..bc755739 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -305,12 +305,19 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.SubRegex + text_key: text + regex_params_list: + - {"pattern": "\\s+\\?", "repl": "?"} + - {"pattern": "\\s+\\.", "repl": "."} + - {"pattern": "\\s+,", "repl": ","} + - {"pattern": "\\s+", "repl": " "} + - _target_: sdp.processors.AddConstantFields output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - + - _target_: 
sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train @@ -325,7 +332,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json @@ -342,7 +348,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index ffbbbc7f..92e958b1 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -153,12 +153,22 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.SubRegex + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + text_key: text + regex_params_list: + - {"pattern": "\\s+\\?", "repl": "?"} + - {"pattern": "\\s+\\.", "repl": "."} + - {"pattern": "\\s+,", "repl": ","} + - {"pattern": "\\s+", "repl": " "} + + - _target_: sdp.processors.datasets.commoncrawl.ManifestToUtf8 + - _target_: sdp.processors.AddConstantFields output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train @@ -173,7 +183,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: 
sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json @@ -190,7 +199,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 84c80c65..ec1d6d96 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -134,12 +134,21 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.SubRegex + text_key: text + regex_params_list: + - {"pattern": "\\s+\\?", "repl": "?"} + - {"pattern": "\\s+\\.", "repl": "."} + - {"pattern": "\\s+,", "repl": ","} + - {"pattern": "\\s+", "repl": " "} + + - _target_: sdp.processors.datasets.commoncrawl.ManifestToUtf8 + - _target_: sdp.processors.AddConstantFields output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train @@ -154,7 +163,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json @@ -171,7 +179,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: 
${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 513c4147..b4fe3020 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -16,4 +16,4 @@ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ - TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles + TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles, ManifestToUtf8 diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index a6bfd134..5a9b7f76 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -19,6 +19,15 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance +class ManifestToUtf8(BaseProcessor): + """ + Processor to convert manifest file to UTF-8 encoding. 
+ """ + def process(self): + with open(self.output_manifest_file, "w") as wout, open(self.input_manifest_file) as win: + for line in win: + print(json.dumps(json.loads(line), ensure_ascii=False), file=wout) + class DropAbsPath(BaseParallelProcessor): """ Drop absolute path From 4c8a21006a9c38a8e8c4a617d235911a4a9f6fb3 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 8 Feb 2024 01:23:51 -0800 Subject: [PATCH 074/115] black Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text_mcv/config.yaml | 4 ++-- tests/test_cfg_end_to_end_tests.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/dataset_configs/armenian/text_mcv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml index 0939d55f..241de972 100644 --- a/dataset_configs/armenian/text_mcv/config.yaml +++ b/dataset_configs/armenian/text_mcv/config.yaml @@ -1,7 +1,7 @@ # Processing pipeline for text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html processors_to_run: "0:" workspace_dir: ??? 
-final_manifest: ${workspace_dir}/final_manifest.json +final_manifest: ${workspace_dir}/manifest12.json processors: - _target_: sdp.processors.CreateInitialManifestByExt @@ -92,7 +92,7 @@ processors: rename_fields: {"text": "Sentence"} - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${final_manifest} + output_manifest_file: ${workspace_dir}/manifest12.json fields_to_keep: ["Sentence", "Source"] - _target_: sdp.processors.langs.armenian.MakeTsv diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index e11d32d7..3719ed79 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -14,10 +14,10 @@ import json import os +import shutil import tarfile from functools import partial from pathlib import Path -import shutil from typing import Callable from unittest import mock @@ -88,6 +88,7 @@ def get_test_cases(): # audio will be downloaded on the fly from a subset of files. # No checks, but need to mock the url list function (done above) (f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", lambda raw_data_dir: True), + (f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", lambda raw_data_dir: True), ] @@ -157,7 +158,8 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: str): assert "processors" in cfg cfg["processors_to_run"] = "all" cfg["workspace_dir"] = str(tmp_path) - cfg["final_manifest"] = str(tmp_path / "final_manifest.json") + if not "final_manifest" in cfg: + cfg["final_manifest"] = str(tmp_path / "final_manifest.json") cfg["data_split"] = "train" cfg["processors"][0]["raw_data_dir"] = str(Path(test_data_root) / rel_path_from_root) @@ -174,8 +176,9 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: str): for reference_line, generated_line in zip(reference_lines, generated_lines): reference_data = json.loads(reference_line) generated_data = json.loads(generated_line) - reference_data.pop("audio_filepath") - 
generated_data.pop("audio_filepath") + if "audio_filepath" in reference_data: + reference_data.pop("audio_filepath") + generated_data.pop("audio_filepath") assert reference_data == generated_data # if CLEAN_UP_TMP_PATH is set to non-0 value, we will delete tmp_path From b1b45bc07f08a968deaa22066631b0f1329d4628 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 8 Feb 2024 01:50:50 -0800 Subject: [PATCH 075/115] not in Signed-off-by: Nikolay Karpov --- tests/test_cfg_end_to_end_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index 3719ed79..a8977454 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -158,7 +158,7 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: str): assert "processors" in cfg cfg["processors_to_run"] = "all" cfg["workspace_dir"] = str(tmp_path) - if not "final_manifest" in cfg: + if "final_manifest" not in cfg: cfg["final_manifest"] = str(tmp_path / "final_manifest.json") cfg["data_split"] = "train" cfg["processors"][0]["raw_data_dir"] = str(Path(test_data_root) / rel_path_from_root) From fc30b34366b1b648856e51f436190e6b1aa6ee65 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 9 Feb 2024 03:02:09 -0800 Subject: [PATCH 076/115] black Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/mcv/config.yaml | 45 ----- sdp/processors/__init__.py | 19 +-- .../huggingface/speech_recognition.py | 158 ------------------ 3 files changed, 7 insertions(+), 215 deletions(-) delete mode 100644 dataset_configs/armenian/mcv/config.yaml delete mode 100644 sdp/processors/huggingface/speech_recognition.py diff --git a/dataset_configs/armenian/mcv/config.yaml b/dataset_configs/armenian/mcv/config.yaml deleted file mode 100644 index 77a28f35..00000000 --- a/dataset_configs/armenian/mcv/config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -processors_to_run: "0:" -workspace_dir: ??? 
-final_manifest: ${workspace_dir}/final_manifest.json - -processors: - - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis - raw_data_dir: ${workspace_dir} - extract_archive_dir: ${workspace_dir}/row - resampled_audio_dir: ${workspace_dir}/16k - data_split: train - language_id: cv-corpus-15.0-2023-09-08-hy-AM - output_manifest_file: ${workspace_dir}/manifest0.json - - - _target_: sdp.processors.ASRWhisper - output_manifest_file: ${workspace_dir}/manifest1.json - pretrained_model: "large-v2" - output_text_field: pred_text - - - _target_: sdp.processors.DropHighWER - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest3.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - - - _target_: sdp.processors.ASRTransformer #pip install accelerate - input_manifest_file: ${workspace_dir}/manifest1.json - output_manifest_file: ${workspace_dir}/manifest4.json - pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" - output_text_field: pred_text3 - - - _target_: sdp.processors.DropHighWER - text_key: text - pred_text_key: pred_text3 - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${final_manifest} - text_key: text - pred_text_key: pred_text3 - cer_threshold: 30 \ No newline at end of file diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 617c6c4b..2502aa25 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -18,6 +18,7 @@ CreateInitialManifestCORAAL, TrainDevTestSplitCORAAL, ) +from sdp.processors.datasets.lhotse import LhotseImport from sdp.processors.datasets.mcv.create_initial_manifest import CreateInitialManifestMCV from sdp.processors.datasets.mls.create_initial_manifest import CreateInitialManifestMLS from sdp.processors.datasets.mls.restore_pc import RestorePCForMLS @@ -31,30 +32,29 @@ from 
sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( NormalizeFromNonPCTextVoxpopuli, ) -from sdp.processors.datasets.lhotse import LhotseImport from sdp.processors.modify_manifest.common import ( AddConstantFields, ChangeToRelativePath, CombineSources, DuplicateFields, + KeepOnlySpecifiedFields, RenameFields, SortManifest, SplitOnFixedDuration, - KeepOnlySpecifiedFields, ) +from sdp.processors.modify_manifest.create_manifest import CreateInitialManifestByExt from sdp.processors.modify_manifest.data_to_data import ( - GetAudioDuration, + CountNumWords, FfmpegConvert, + GetAudioDuration, + InsIfASRInsertion, ReadTxtLines, SplitLineBySentence, - CountNumWords, - InsIfASRInsertion, SubIfASRSubstitution, SubMakeLowercase, SubRegex, ) from sdp.processors.modify_manifest.data_to_dropbool import ( - PreserveByValue, DropASRError, DropASRErrorBeginningEnd, DropHighCER, @@ -68,15 +68,10 @@ DropLowWordMatchRate, DropNonAlphabet, DropOnAttribute, + PreserveByValue, ) from sdp.processors.modify_manifest.make_letters_uppercase_after_period import ( MakeLettersUppercaseAfterPeriod, ) from sdp.processors.nemo.asr_inference import ASRInference from sdp.processors.nemo.pc_inference import PCInference - -from sdp.processors.huggingface.speech_recognition import ( - ASRTransformer, - ASRWhisper, -) -from sdp.processors.modify_manifest.create_manifest import CreateInitialManifestByExt \ No newline at end of file diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py deleted file mode 100644 index 4112112e..00000000 --- a/sdp/processors/huggingface/speech_recognition.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -from pathlib import Path - -from tqdm import tqdm - -from sdp.processors.base_processor import BaseProcessor -from sdp.utils.common import load_manifest - - -class ASRWhisper(BaseProcessor): - """ - Simple example to transcribe using ASR Whisper model from HuggingFace. - There are many ways to improve it: make batch inference, split long files, return predicted language, etc. - - Args: - pretrained_model (str): name of pretrained model on HuggingFace. - output_text_field (str): field to save transcription result. - device (str): Inference device. - """ - - def __init__( - self, - pretrained_model: str, - output_text_field: str, - device: str = None, - output_lang_field: str = "lid", - **kwargs, - ): - super().__init__(**kwargs) - import torch - import whisper # pip install -U openai-whisper - - self.whisper = whisper - self.pretrained_model = pretrained_model - self.output_text_field = output_text_field - self.device = device - self.output_lang_field = output_lang_field - if self.device is None: - if torch.cuda.is_available(): - self.device = "cuda" - else: - self.device = "cpu" - self.model = whisper.load_model(self.pretrained_model) - - def process(self): - json_list = load_manifest(Path(self.input_manifest_file)) - - Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - - with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(json_list): - pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) - - item[self.output_text_field] = pred_text - item[self.output_lang_field] = pred_lang - 
f.write(json.dumps(item, ensure_ascii=False) + '\n') - - def whisper_infer(self, audio_path): - audio = self.whisper.load_audio(audio_path) - - audio = self.whisper.pad_or_trim(audio) - mel = self.whisper.log_mel_spectrogram(audio) - mel = mel.to(self.device) - - _, probs = self.model.detect_language(mel) - lang = max(probs, key=probs.get) - - options = self.whisper.DecodingOptions() - result = self.whisper.decode(self.model, mel, options) - return result.text, lang - - -class ASRTransformer(BaseProcessor): - """ - Processor to transcribe using ASR Transformer model from HuggingFace. - - Args: - pretrained_model (str): name of pretrained model on HuggingFace. - output_text_field (str): field to save transcription result. - device (str): Inference device. - batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 - torch_dtype (str): Tensor data type. Default to "float32" - """ - - def __init__( - self, - pretrained_model: str, - output_text_field: str, - device: str = None, - batch_size: int = 1, - torch_dtype: str = "float32", - **kwargs, - ): - super().__init__(**kwargs) - import torch - from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - - self.pretrained_model = pretrained_model - self.output_text_field = output_text_field - self.device = device - self.batch_size = batch_size - if torch_dtype == "float32": - self.torch_dtype = torch.float32 - elif torch_dtype == "float16": - self.torch_dtype = torch.float16 - else: - raise NotImplementedError(torch_dtype + " is not implemented!") - - if self.device is None: - if torch.cuda.is_available(): - self.device = "cuda:0" - else: - self.device = "cpu" - - self.model = AutoModelForSpeechSeq2Seq.from_pretrained( - self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True - ) - self.model.to(self.device) - - processor = AutoProcessor.from_pretrained(self.pretrained_model) - self.pipe = pipeline( - 
"automatic-speech-recognition", - model=self.model, - tokenizer=processor.tokenizer, - feature_extractor=processor.feature_extractor, - max_new_tokens=128, - chunk_length_s=30, - batch_size=self.batch_size, - return_timestamps=True, - torch_dtype=self.torch_dtype, - device=self.device, - ) - - def process(self): - json_list = load_manifest(Path(self.input_manifest_file)) - - Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - - with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(json_list): - pred_text = self.pipe(item["audio_filepath"])["text"] - - item[self.output_text_field] = pred_text - f.write(json.dumps(item, ensure_ascii=False) + '\n') From ee1c52e9bc3d2f828b1e164619f7858f4d2cb407 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 9 Feb 2024 03:07:30 -0800 Subject: [PATCH 077/115] add ASRWhisper Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 1 + .../huggingface/speech_recognition.py | 158 ++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 sdp/processors/huggingface/speech_recognition.py diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 2502aa25..2b834bf9 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,6 +32,7 @@ from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( NormalizeFromNonPCTextVoxpopuli, ) +from sdp.processors.huggingface.speech_recognition import ASRTransformer, ASRWhisper from sdp.processors.modify_manifest.common import ( AddConstantFields, ChangeToRelativePath, diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py new file mode 100644 index 00000000..4112112e --- /dev/null +++ b/sdp/processors/huggingface/speech_recognition.py @@ -0,0 +1,158 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path + +from tqdm import tqdm + +from sdp.processors.base_processor import BaseProcessor +from sdp.utils.common import load_manifest + + +class ASRWhisper(BaseProcessor): + """ + Simple example to transcribe using ASR Whisper model from HuggingFace. + There are many ways to improve it: make batch inference, split long files, return predicted language, etc. + + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. 
+ """ + + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + output_lang_field: str = "lid", + **kwargs, + ): + super().__init__(**kwargs) + import torch + import whisper # pip install -U openai-whisper + + self.whisper = whisper + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.output_lang_field = output_lang_field + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + self.model = whisper.load_model(self.pretrained_model) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) + + item[self.output_text_field] = pred_text + item[self.output_lang_field] = pred_lang + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + def whisper_infer(self, audio_path): + audio = self.whisper.load_audio(audio_path) + + audio = self.whisper.pad_or_trim(audio) + mel = self.whisper.log_mel_spectrogram(audio) + mel = mel.to(self.device) + + _, probs = self.model.detect_language(mel) + lang = max(probs, key=probs.get) + + options = self.whisper.DecodingOptions() + result = self.whisper.decode(self.model, mel, options) + return result.text, lang + + +class ASRTransformer(BaseProcessor): + """ + Processor to transcribe using ASR Transformer model from HuggingFace. + + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 + torch_dtype (str): Tensor data type. 
Default to "float32" + """ + + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: int = 1, + torch_dtype: str = "float32", + **kwargs, + ): + super().__init__(**kwargs) + import torch + from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + if torch_dtype == "float32": + self.torch_dtype = torch.float32 + elif torch_dtype == "float16": + self.torch_dtype = torch.float16 + else: + raise NotImplementedError(torch_dtype + " is not implemented!") + + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda:0" + else: + self.device = "cpu" + + self.model = AutoModelForSpeechSeq2Seq.from_pretrained( + self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True + ) + self.model.to(self.device) + + processor = AutoProcessor.from_pretrained(self.pretrained_model) + self.pipe = pipeline( + "automatic-speech-recognition", + model=self.model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + max_new_tokens=128, + chunk_length_s=30, + batch_size=self.batch_size, + return_timestamps=True, + torch_dtype=self.torch_dtype, + device=self.device, + ) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_text = self.pipe(item["audio_filepath"])["text"] + + item[self.output_text_field] = pred_text + f.write(json.dumps(item, ensure_ascii=False) + '\n') From 3dcc2f799cf3704842f1689ff81d9f02cc6d6f78 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 9 Feb 2024 03:16:24 -0800 Subject: [PATCH 078/115] requirements Signed-off-by: Nikolay Karpov --- 
dataset_configs/armenian/audio_books/config.yaml | 9 +++++++-- requirements/main.txt | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/dataset_configs/armenian/audio_books/config.yaml b/dataset_configs/armenian/audio_books/config.yaml index f50c2f41..bc0fcf52 100644 --- a/dataset_configs/armenian/audio_books/config.yaml +++ b/dataset_configs/armenian/audio_books/config.yaml @@ -28,12 +28,17 @@ processors: pretrained_model: "large-v2" output_text_field: text - - _target_: sdp.processors.SubMakeLowercase + - _target_: sdp.processors.ASRTransformer #pip install accelerate output_manifest_file: ${workspace_dir}/manifest4.json + pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" + output_text_field: pred_text + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest5.json text_key: "text" - _target_: sdp.processors.DropNonAlphabet - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև.,!?" test_cases: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} diff --git a/requirements/main.txt b/requirements/main.txt index 4e5c79fb..0f829728 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -1,3 +1,4 @@ +accelerate diff_match_patch editdistance hydra-core @@ -5,9 +6,11 @@ joblib librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) 
will work numpy omegaconf +openai-whisper pandas regex sox tqdm +transformers wget # for some processers, additionally https://github.com/NVIDIA/NeMo is required From 61c8fe71c5c21f8200c949cb2d561a2750936666 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 9 Feb 2024 05:03:00 -0800 Subject: [PATCH 079/115] test audio_books.yaml Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books/config.yaml | 2 +- tests/test_cfg_end_to_end_tests.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dataset_configs/armenian/audio_books/config.yaml b/dataset_configs/armenian/audio_books/config.yaml index bc0fcf52..14cd41b3 100644 --- a/dataset_configs/armenian/audio_books/config.yaml +++ b/dataset_configs/armenian/audio_books/config.yaml @@ -28,7 +28,7 @@ processors: pretrained_model: "large-v2" output_text_field: text - - _target_: sdp.processors.ASRTransformer #pip install accelerate + - _target_: sdp.processors.ASRTransformer output_manifest_file: ${workspace_dir}/manifest4.json pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" output_text_field: pred_text diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index a8977454..84dbb1f4 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -89,6 +89,7 @@ def get_test_cases(): # No checks, but need to mock the url list function (done above) (f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", lambda raw_data_dir: True), (f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", lambda raw_data_dir: True), + (f"{DATASET_CONFIGS_ROOT}/armenian/audio_books/config.yaml", lambda raw_data_dir: True), ] From f7182f2e5fe59cd3594e95ce3ee3c49f17981034 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 25 Feb 2024 22:52:43 -0800 Subject: [PATCH 080/115] add docs Signed-off-by: Nikolay Karpov --- .../armenian/audio_books/config.yaml | 57 ++++++++++++++++--- dataset_configs/armenian/text_mcv/config.yaml 
| 39 ++++++++++++- 2 files changed, 87 insertions(+), 9 deletions(-) diff --git a/dataset_configs/armenian/audio_books/config.yaml b/dataset_configs/armenian/audio_books/config.yaml index 14cd41b3..db33767e 100644 --- a/dataset_configs/armenian/audio_books/config.yaml +++ b/dataset_configs/armenian/audio_books/config.yaml @@ -1,3 +1,38 @@ +documentation: | + Audio books + ###### + + This config can be used as example to process audiobooks in Armenian language and prepare + dataset in the NeMo format. + + This config performs the following data processing. + + 1. Create initial manifest by collecling all avalible files with mp3 expention in raw_data_dir folder. + + 2. Convert mp3 into wav format using the Ffmpeg suite, with a downsampling to a 16000 Hz sample rate + and a unification of all audio channels into a mono track. + 3. Count duration for audio files in seconds and save it into duration field. + 4. Filter out broken files with duration shorter than 0 seconds. + You can directly change the config file to control this. + 5. Predict transcription using large-v2 Whisper ASR model into text field. + 6. Predict transcription using distil-whisper/distil-large-v2 transformers ASR model into pred_text field. + 7. Drops everything with non-armenean characters. + 8. Normalise some text examples with SubRegex. + + **Required arguments**. + * **workspace_dir**: specify the workspace folder where all audio files will be stored. + + Note that you can customize any part of this config either directly or from command-line. + + **Output format**. + * ``${workspace_dir}/final_manifest.json`` - final_manifest manifest with all the data. + + Output manifest contain the following fields: + * **audio_filepath (str)**: relative path to the audio files. + * **text (str)**: transcription predicted by Whisper (Upper-case with punctuation). + * **pred_text (str)**: transcription predicted by Transformers (Upper-case without punctuation). 
+ * **duration (float)**: audio duration in seconds. + processors_to_run: "0:" workspace_dir: ??? final_manifest: ${workspace_dir}/final_manifest.json @@ -22,21 +57,23 @@ processors: audio_filepath_field: audio_filepath duration_field: duration output_manifest_file: ${workspace_dir}/manifest2.json - - - _target_: sdp.processors.ASRWhisper + + - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest3.json + input_field: duration + target_value: 0 + operator: gt + + - _target_: sdp.processors.ASRWhisper # pip install -U openai-whisper + output_manifest_file: ${workspace_dir}/manifest4.json pretrained_model: "large-v2" output_text_field: text - - _target_: sdp.processors.ASRTransformer - output_manifest_file: ${workspace_dir}/manifest4.json + - _target_: sdp.processors.ASRTransformers #pip install accelerate transformers + output_manifest_file: ${workspace_dir}/manifest5.json pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" output_text_field: pred_text - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: "text" - - _target_: sdp.processors.DropNonAlphabet output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև.,!?" 
@@ -44,6 +81,10 @@ processors: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: "text" + - _target_: sdp.processors.SubRegex output_manifest_file: ${final_manifest} regex_params_list: diff --git a/dataset_configs/armenian/text_mcv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml index 241de972..8bed0949 100644 --- a/dataset_configs/armenian/text_mcv/config.yaml +++ b/dataset_configs/armenian/text_mcv/config.yaml @@ -1,4 +1,41 @@ -# Processing pipeline for text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html +documentation: | + Text MCV + ###### + + This config can be used to prepare text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html + + This config performs the following data processing. + + 1. Create initial manifest by collecling all avalible files with txt expention in raw_data_dir folder. + 2. Read text files line by line. + 3. Normalize text lines using Regex. + 4. Split lines into sentences. + 5. Replaces common transcription errors as well as "non-linguistic", + "unintelligible" and "redacted" flags. + 6. Drops everything with non-armenean characters. + 7. Drops all utterances that are shorter than 3 words or longer than 15 words. + 8. Extract source book name. + 9. Convert into target csv format. + 10. Get random subsample. + + + **Required arguments**. + + * **workspace_dir**: specify the workspace folder where all audio files will be stored. + + Note that you can customize any part of this config either directly or from command-line. 
+ Here are some common customizations to consider: + + **Output format**. + + Output manifest manifest12.json contain the following fields: + * **Sentence (str)**: text of sentence to vocalise. + * **Source (str)**: source book. + + Output manifest manifest13.tsv contain the same data as manifest12.json but in tsv format. + + Output manifest manifest14.tsv contain random subset of data from manifest13.json. + processors_to_run: "0:" workspace_dir: ??? final_manifest: ${workspace_dir}/manifest12.json From 2d5ee5bda188bb94c54d007bcb90642f0bb11dce Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 25 Feb 2024 22:55:25 -0800 Subject: [PATCH 081/115] black Signed-off-by: Nikolay Karpov --- requirements/main.txt | 3 - sdp/processors/__init__.py | 2 +- .../huggingface/speech_recognition.py | 21 +++-- .../modify_manifest/data_to_data.py | 77 ++++++++++++------- 4 files changed, 66 insertions(+), 37 deletions(-) diff --git a/requirements/main.txt b/requirements/main.txt index 0f829728..4e5c79fb 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -1,4 +1,3 @@ -accelerate diff_match_patch editdistance hydra-core @@ -6,11 +5,9 @@ joblib librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) 
will work numpy omegaconf -openai-whisper pandas regex sox tqdm -transformers wget # for some processers, additionally https://github.com/NVIDIA/NeMo is required diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 2b834bf9..f7a896e1 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,7 +32,7 @@ from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( NormalizeFromNonPCTextVoxpopuli, ) -from sdp.processors.huggingface.speech_recognition import ASRTransformer, ASRWhisper +from sdp.processors.huggingface.speech_recognition import ASRTransformers, ASRWhisper from sdp.processors.modify_manifest.common import ( AddConstantFields, ChangeToRelativePath, diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 4112112e..d9194c7a 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -17,6 +17,7 @@ from tqdm import tqdm +from sdp.logging import logger from sdp.processors.base_processor import BaseProcessor from sdp.utils.common import load_manifest @@ -41,9 +42,13 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - import torch - import whisper # pip install -U openai-whisper + try: + import torch + import whisper + except: + raise ImportError("Need to install whisper: pip install -U openai-whisper") + logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.") self.whisper = whisper self.pretrained_model = pretrained_model self.output_text_field = output_text_field @@ -84,9 +89,9 @@ def whisper_infer(self, audio_path): return result.text, lang -class ASRTransformer(BaseProcessor): +class ASRTransformers(BaseProcessor): """ - Processor to transcribe using ASR Transformer model from HuggingFace. + Processor to transcribe using ASR Transformers model from HuggingFace. 
Args: pretrained_model (str): name of pretrained model on HuggingFace. @@ -106,9 +111,13 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - import torch - from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + try: + import torch + from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + except: + raise ImportError("Need to install transformers: pip install accelerate transformers") + logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.") self.pretrained_model = pretrained_model self.output_text_field = output_text_field self.device = device diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 9c8b1a84..8b635bd7 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -13,9 +13,10 @@ # limitations under the License. import collections -import re import os +import re from typing import Dict, List + import soundfile as sf from sdp.logging import logger @@ -27,14 +28,17 @@ class GetAudioDuration(BaseParallelProcessor): """ - Processor to count audio duration using audio file path from input_field + Processor that computes the duration of the file in audio_filepath_field (using soundfile) + and saves the duration in duration_field. If there is an error computing the duration, + the duration_field will be updated with the value -1.0. Args: audio_filepath_field (str): where to get path to wav file. duration_field (str): where to put to audio duration. 
Returns: - All the same fields as in the input manifest plus output_field + All the same fields as in the input manifest plus duration_field """ + def __init__( self, audio_filepath_field: str, @@ -44,21 +48,21 @@ def __init__( super().__init__(**kwargs) self.audio_filepath_field = audio_filepath_field self.duration_field = duration_field - + def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.audio_filepath_field] try: data, samplerate = sf.read(audio_filepath) - data_entry[self.duration_field]=data.shape[0]/samplerate + data_entry[self.duration_field] = data.shape[0] / samplerate except Exception as e: logger.warning(str(e) + " file: " + audio_filepath) data_entry[self.duration_field] = -1.0 return [DataEntry(data=data_entry)] - + class FfmpegConvert(BaseParallelProcessor): """ - Processor for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. + Processor for converting video or audio files to audio using FFmpeg and updating the dataset with the path to the resampled audio. Args: resampled_audio_dir (str): The directory to store the resampled audio files. @@ -70,6 +74,7 @@ class FfmpegConvert(BaseParallelProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" + def __init__( self, resampled_audio_dir: str, @@ -92,18 +97,18 @@ def prepare(self): os.makedirs(self.resampled_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - video = data_entry[self.input_field] + input_file = data_entry[self.input_field] if self.key_field: key = data_entry[self.key_field] os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) else: - key = os.path.splitext(video)[0].split("/")[-1] + key = os.path.splitext(input_file)[0].split("/")[-1] audio = os.path.join(self.resampled_audio_dir, key) + ".wav" if not os.path.isfile(audio): - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + ffmpeg_convert(input_file, audio, self.target_samplerate, self.target_nchannels) - data_entry[self.output_field]= audio + data_entry[self.output_field] = audio if self.key_field: data_entry[self.key_field] = key return [DataEntry(data=data_entry)] @@ -111,7 +116,8 @@ def process_dataset_entry(self, data_entry): class ReadTxtLines(BaseParallelProcessor): """ - Processor for reading text lines from a file and updating the manifest. + The text file specified in source_filepath will be read, and each line in it will be added as a line in the output manifest, + saved in the field text_key. Args: source_filepath (str): The field containing the file path in the manifest. @@ -119,6 +125,7 @@ class ReadTxtLines(BaseParallelProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" + def __init__( self, source_filepath: str, @@ -126,18 +133,18 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.input_field = source_filepath - self.output_field = text_key + self.source_filepath = source_filepath + self.text_key = text_key def process_dataset_entry(self, data_entry): - fname = data_entry[self.input_field] + fname = data_entry[self.source_filepath] data_list = [] with open(fname, "r") as f: for line in f: line = line.strip() if line: data = data_entry.copy() - data[self.output_field] = line + data[self.text_key] = line data_list.append(DataEntry(data=data)) return data_list @@ -145,13 +152,15 @@ def process_dataset_entry(self, data_entry): class SplitLineBySentence(BaseParallelProcessor): """ Processor for splitting lines of text into sentences based on a specified pattern. + One line containing N sentences will be transformed into N lines containing one sentence. Args: - text_key (str): The field containing the input text lines in the dataset. + text_key (str): The field containing the text lines in the dataset. end_pattern (str): The regular expression pattern to identify sentence boundaries. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" + def __init__( self, text_key: str, @@ -169,21 +178,21 @@ def process_dataset_entry(self, data_entry): ends = [m.start() for m in self.pattern.finditer(line)] if ends: for end in ends: - sent = line[start:end+1].strip() + sent = line[start : end + 1].strip() # if sent and sent[0].isupper(): data = data_entry.copy() data[self.text_key] = sent data_list.append(DataEntry(data=data)) - start = end+1 - if start Date: Tue, 27 Feb 2024 07:07:24 -0800 Subject: [PATCH 082/115] lanID Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 5a9b7f76..010d2213 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -748,8 +748,8 @@ def __init__( self.iso_m = {'English':'en', 'Spanish':'es', 'Basque':'eu', 'Dutch':'nl', 'Welsh':'cy', 'Italian':'it', 'Catalan':'ca', 'Maltese':'mt', 'Swedish':'sv', 'French':'fr', 'German':'de', 'Chuvash':'cv', 'Kinyarwanda':'rw', 'Polish':'pl', 'Kabyle':'kab', 'Interlingua': 'ua', 'Portuguese': 'pt', 'Hakha_Chin': 'cnh', 'Romansh_Sursilvan':'roh', 'Breton':'br', 'Esperanto':'epo', 'Czech':'ces', 'Latvian':'lav', - 'Indonesian':'ind', 'Slovenian':'slv', 'Turkish':'tur', 'Frisian':'frr', 'Tatar':'tat', 'Persian':'fas', 'Estonian':'est', 'Romanian':'rum', 'Chinese_Hongkong':'zh', 'Chinese_Taiwan':'zh', - 'Georgian':'kat', 'Kyrgyz':'kir', 'Dhivehi':'div', 'Sakha':'sah'} + 'Indonesian':'ind', 'Slovenian':'slv', 'Turkish':'tur', 'Frisian':'frr', 'Tatar':'tat', 'Persian':'fas', 'Estonian':'est', 'Romanian':'rum', 'Chinese_Hongkong':'zh', 'Chinese_Taiwan':'zh', 'Chinese_China':'zh', + 'Georgian':'kat', 'Kyrgyz':'kir', 'Dhivehi':'div', 'Sakha':'sah', 'Arabic':'ar', 'Japanese': 'ja'} def process_dataset_entry(self, data_entry): data_entry[self.output_lang_field] = 
self.iso_m[data_entry[self.input_lang_field]] @@ -1269,7 +1269,7 @@ def __init__( resampled_audio_dir: str, input_field: str, output_field: str, - key_field: str, + key_field: str = None, target_samplerate: int = 16000, target_nchannels: int = 1, **kwargs, @@ -1282,17 +1282,25 @@ def __init__( self.target_samplerate = target_samplerate self.target_nchannels = target_nchannels + def prepare(self): + os.makedirs(self.resampled_audio_dir, exist_ok=True) + return super().prepare() + def process_dataset_entry(self, data_entry): - video = data_entry[self.input_field] - key = data_entry[self.key_field] - os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + input_file = data_entry[self.input_field] + if self.key_field: + key = data_entry[self.key_field] + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + else: + key = os.path.splitext(input_file)[0].split("/")[-1] audio = os.path.join(self.resampled_audio_dir, key) + ".wav" if not os.path.isfile(audio): - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + ffmpeg_convert(input_file, audio, self.target_samplerate, self.target_nchannels) - data_entry[self.output_field]= audio - data_entry[self.key_field] = key + data_entry[self.output_field] = audio + if self.key_field: + data_entry[self.key_field] = key return [DataEntry(data=data_entry)] From 33b4f6270eb6e0df60eb875e2929c19c1e8ab7b0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 27 Feb 2024 07:08:29 -0800 Subject: [PATCH 083/115] srt Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/harv_utils.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py index ebc6f5b1..92b3ffd1 100644 --- a/sdp/processors/datasets/commoncrawl/harv_utils.py +++ b/sdp/processors/datasets/commoncrawl/harv_utils.py @@ -48,7 +48,14 @@ def 
load_manifest(manifest: Path, keys: List[str] = []) -> List[Dict[str, Union[ def get_vtt_text(vtt_file): text_all = [] - for caption in webvtt.read(vtt_file): + if os.path.splitext(vtt_file)[1]=='.vtt': + webvtt_i = webvtt.read + elif os.path.splitext(vtt_file)[1]=='.srt': + webvtt_i = webvtt.from_srt + else: + raise ValueError("Unsupported extention of file "+vtt_file) + + for caption in webvtt_i(vtt_file): text = caption.text if text.find("thumbnails")!=-1: pass @@ -122,7 +129,15 @@ def split_by_vtt_new(vtt_file, samplerate): try: _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') text_list, start_s, end_s = [], [], [] - for caption in webvtt.read(vtt_file): + if os.path.splitext(vtt_file)[1]=='.vtt': + webvtt_i = webvtt.read + elif os.path.splitext(vtt_file)[1]=='.srt': + webvtt_i = webvtt.from_srt + else: + raise ValueError("Unsupporte extention of file "+vtt_file) + + + for caption in webvtt_i(vtt_file): text = ' '.join(caption.text.split('\n')) _start = parse_hours(caption.start) From e4ebfa712056cf42b3fa4981ed3388ebf1b0ac83 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 27 Feb 2024 07:09:35 -0800 Subject: [PATCH 084/115] load_manifest Signed-off-by: Nikolay Karpov --- sdp/utils/common.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sdp/utils/common.py b/sdp/utils/common.py index 45f04242..b5c82039 100644 --- a/sdp/utils/common.py +++ b/sdp/utils/common.py @@ -16,11 +16,21 @@ import tarfile import urllib import zipfile +import json import wget - +from pathlib import Path +from typing import Dict, List, Union from sdp.logging import logger +def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: + # read NeMo manifest as a list of dicts + result = [] + with manifest.open() as f: + for line in f: + data = json.loads(line) + result.append(data) + return result def download_file(source_url: str, target_directory: str, verbose = True): # make sure target_directory is an absolute path to avoid bugs 
when we change directories to download data later From ab7e1d9a20a4c6292e06161ebe1efa0f3a0a9692 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 29 Feb 2024 07:31:11 -0800 Subject: [PATCH 085/115] docs Signed-off-by: Nikolay Karpov --- docs/src/sdp/existing_configs.rst | 27 ++++++++++++++++++- .../huggingface/speech_recognition.py | 2 +- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst index 987cb5de..bff45e6f 100644 --- a/docs/src/sdp/existing_configs.rst +++ b/docs/src/sdp/existing_configs.rst @@ -125,4 +125,29 @@ Corpus of Regional African American Language (CORAAL) .. toctree:: :hidden: - config-docs/english/coraal/config \ No newline at end of file + config-docs/english/coraal/config + +Corpus of Armenian Text to Upload into Common Voice (MCV) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Dataset link:** https://commonvoice.mozilla.org/ + +`config `__ | +:doc:`documentation ` + +.. toctree:: + :hidden: + + config-docs/armenian/text_mcv/config + +Corpus based on Armenian audiobooks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +`config `__ | +:doc:`documentation ` + +.. toctree:: + :hidden: + + config-docs/armenian/audio_books/config \ No newline at end of file diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index d9194c7a..c4983774 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -97,7 +97,7 @@ class ASRTransformers(BaseProcessor): pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. - batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 + batch_size (int): Inference batch size. Defaults to 1. TODO: support batch_size > 1 torch_dtype (str): Tensor data type. 
Default to "float32" """ From e63f2509a568ab06c7bf9d2487d399edb4ac2818 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 29 Feb 2024 08:14:15 -0800 Subject: [PATCH 086/115] black Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/data_to_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 8b635bd7..07b668f5 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -63,12 +63,13 @@ def process_dataset_entry(self, data_entry): class FfmpegConvert(BaseParallelProcessor): """ Processor for converting video or audio files to audio using FFmpeg and updating the dataset with the path to the resampled audio. - + If key_field is not None it is used as an output file name. If key_field is None the output file name is the same as input file name with different extention + and input file name saves to key_field back. Args: resampled_audio_dir (str): The directory to store the resampled audio files. input_field (str): The field in the dataset representing the path to the input video or audio files. output_field (str): The field to store the path to the resampled audio files in the dataset. - key_field (str): The field in the dataset representing the unique key or identifier for each entry. + key_field (str): The field in the dataset representing the unique key or identifier for each entry. Defaults to None. target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
From 714a7d1d563529282a802631572c92a517fa472f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 29 Feb 2024 23:21:22 -0800 Subject: [PATCH 087/115] key Signed-off-by: Nikolay Karpov --- .../armenian/audio_books/config.yaml | 19 ++-- dataset_configs/armenian/text_mcv/config.yaml | 12 +-- .../huggingface/speech_recognition.py | 18 ++-- sdp/processors/langs/armenian.py | 36 ++++--- .../modify_manifest/create_manifest.py | 19 ++-- .../modify_manifest/data_to_data.py | 62 +++++------ .../modify_manifest/data_to_dropbool.py | 101 ++++++++++++++---- 7 files changed, 167 insertions(+), 100 deletions(-) diff --git a/dataset_configs/armenian/audio_books/config.yaml b/dataset_configs/armenian/audio_books/config.yaml index db33767e..e670abcf 100644 --- a/dataset_configs/armenian/audio_books/config.yaml +++ b/dataset_configs/armenian/audio_books/config.yaml @@ -41,7 +41,7 @@ processors: - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: ${workspace_dir}/mp3 extension: mp3 - output_field: source_filepath + output_file_key: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - _target_: sdp.processors.FfmpegConvert @@ -49,30 +49,31 @@ processors: resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 - input_field: "source_filepath" - output_field: "audio_filepath" - key_field: null + input_file_key: "source_filepath" + output_file_key: "audio_filepath" + id_key: null - _target_: sdp.processors.GetAudioDuration - audio_filepath_field: audio_filepath - duration_field: duration + audio_file_key: audio_filepath + duration_key: duration output_manifest_file: ${workspace_dir}/manifest2.json - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest3.json - input_field: duration + input_value_key: duration target_value: 0 operator: gt - _target_: sdp.processors.ASRWhisper # pip install -U openai-whisper output_manifest_file: ${workspace_dir}/manifest4.json pretrained_model: 
"large-v2" - output_text_field: text + output_text_key: text + output_lang_key: lid - _target_: sdp.processors.ASRTransformers #pip install accelerate transformers output_manifest_file: ${workspace_dir}/manifest5.json pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" - output_text_field: pred_text + output_text_key: pred_text - _target_: sdp.processors.DropNonAlphabet output_manifest_file: ${workspace_dir}/manifest6.json diff --git a/dataset_configs/armenian/text_mcv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml index 8bed0949..4a7237e2 100644 --- a/dataset_configs/armenian/text_mcv/config.yaml +++ b/dataset_configs/armenian/text_mcv/config.yaml @@ -44,11 +44,11 @@ processors: - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: ${workspace_dir}/arm_docs extension: txt - output_field: source_filepath + output_file_key: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - _target_: sdp.processors.ReadTxtLines - source_filepath: source_filepath + input_file_key: source_filepath text_key: text_line output_manifest_file: ${workspace_dir}/manifest1.json @@ -109,20 +109,20 @@ processors: - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest8.json - input_field: num_words + input_value_key: num_words target_value: 15 operator: le - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest9.json - input_field: num_words + input_value_key: num_words target_value: 3 operator: ge - _target_: sdp.processors.langs.armenian.GetSourceBookName output_manifest_file: ${workspace_dir}/manifest10.json - source_filepath: source_filepath - source_field: Source + source_file_key: source_filepath + source_key: Source - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest11.json diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index c4983774..d8702246 
100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -36,9 +36,9 @@ class ASRWhisper(BaseProcessor): def __init__( self, pretrained_model: str, - output_text_field: str, + output_text_key: str, device: str = None, - output_lang_field: str = "lid", + output_lang_key: str = "lid", **kwargs, ): super().__init__(**kwargs) @@ -51,9 +51,9 @@ def __init__( logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.") self.whisper = whisper self.pretrained_model = pretrained_model - self.output_text_field = output_text_field + self.output_text_key = output_text_key self.device = device - self.output_lang_field = output_lang_field + self.output_lang_key = output_lang_key if self.device is None: if torch.cuda.is_available(): self.device = "cuda" @@ -70,8 +70,8 @@ def process(self): for item in tqdm(json_list): pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) - item[self.output_text_field] = pred_text - item[self.output_lang_field] = pred_lang + item[self.output_text_key] = pred_text + item[self.output_lang_key] = pred_lang f.write(json.dumps(item, ensure_ascii=False) + '\n') def whisper_infer(self, audio_path): @@ -104,7 +104,7 @@ class ASRTransformers(BaseProcessor): def __init__( self, pretrained_model: str, - output_text_field: str, + output_text_key: str, device: str = None, batch_size: int = 1, torch_dtype: str = "float32", @@ -119,7 +119,7 @@ def __init__( logger.warning("This is an example processor, for demonstration only. 
Do not use it for production purposes.") self.pretrained_model = pretrained_model - self.output_text_field = output_text_field + self.output_text_key = output_text_key self.device = device self.batch_size = batch_size if torch_dtype == "float32": @@ -163,5 +163,5 @@ def process(self): for item in tqdm(json_list): pred_text = self.pipe(item["audio_filepath"])["text"] - item[self.output_text_field] = pred_text + item[self.output_text_key] = pred_text f.write(json.dumps(item, ensure_ascii=False) + '\n') diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 1e290d29..586807ed 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -13,9 +13,15 @@ # limitations under the License. import os -import pandas as pd from pathlib import Path -from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry + +import pandas as pd + +from sdp.processors.base_processor import ( + BaseParallelProcessor, + BaseProcessor, + DataEntry, +) from sdp.utils.common import load_manifest @@ -24,25 +30,26 @@ class GetSourceBookName(BaseParallelProcessor): Processor for extracting source book name from file paths and updating the manifest. Args: - source_filepath (str): The field containing the file path in the manifest. - source_field (str): The field to store the extracted source book name in the manifest. + source_file_key (str): The field containing the file path in the manifest. + source_key (str): The field to store the extracted source book name in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" + def __init__( self, - source_filepath: str, - source_field: str, + source_file_key: str, + source_key: str, **kwargs, ): super().__init__(**kwargs) - self.source_filepath = source_filepath - self.source_field = source_field + self.source_file_key = source_file_key + self.source_key = source_key def process_dataset_entry(self, data_entry): - input_values = os.path.splitext(data_entry[self.source_filepath])[0].split("/") - - data_entry[self.source_field] = input_values[-1] + input_values = os.path.splitext(data_entry[self.source_file_key])[0].split("/") + + data_entry[self.source_key] = input_values[-1] return [DataEntry(data=data_entry)] @@ -54,10 +61,12 @@ class MakeTsv(BaseProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. """ + def process(self): df1 = pd.DataFrame.from_records(load_manifest(Path(self.input_manifest_file))) df1.to_csv(self.output_manifest_file, index=None, sep='\t') + class RandomTsvPart(BaseProcessor): """ Processor for creating a random subset of a TSV (Tab-Separated Values) file based on the specified fraction. @@ -68,6 +77,7 @@ class RandomTsvPart(BaseProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
""" + def __init__( self, part: float, @@ -80,4 +90,6 @@ def __init__( def process(self): df1 = pd.read_csv(self.input_manifest_file, sep='\t') - df1.sample(frac=self.part, random_state = self.random_state).to_csv(self.output_manifest_file, index=None, sep='\t') \ No newline at end of file + df1.sample(frac=self.part, random_state=self.random_state).to_csv( + self.output_manifest_file, index=None, sep='\t' + ) diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index 335724ca..d9cb0952 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -16,14 +16,15 @@ from sdp.processors.base_processor import BaseParallelProcessor, DataEntry + class CreateInitialManifestByExt(BaseParallelProcessor): """ Processor for creating an initial dataset manifest by saving filepaths with a common extension to the field specified in output_field. Args: raw_data_dir (str): The root directory of the files to be added to the initial manifest. This processor will recursively look for files with the extension 'extension' inside this directory. - output_field (str): The field to store the paths to the files in the dataset. - extension (str): The field stecify extension of the files to use them in the dataset. + output_file_key (str): The key to store the paths to the files in the dataset. + extension (str): The key to stecify extension of the files to use them in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" @@ -31,21 +32,19 @@ class CreateInitialManifestByExt(BaseParallelProcessor): def __init__( self, raw_data_dir: str, - output_field: str = "audio_filepath", + output_file_key: str = "audio_filepath", extension: str = "mp3", **kwargs, ): super().__init__(**kwargs) self.raw_data_dir = Path(raw_data_dir) - self.output_field = output_field + self.output_file_key = output_file_key self.extension = extension def read_manifest(self): - input_files = [str(self.raw_data_dir / file) for file in \ - self.raw_data_dir.rglob('*.' + self.extension)] - return input_files - + output_file = [str(self.raw_data_dir / file) for file in self.raw_data_dir.rglob('*.' + self.extension)] + return output_file + def process_dataset_entry(self, data_entry): - data = {self.output_field: data_entry} + data = {self.output_file_key: data_entry} return [DataEntry(data=data)] - \ No newline at end of file diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 07b668f5..dd09f8dc 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -33,43 +33,43 @@ class GetAudioDuration(BaseParallelProcessor): the duration_field will be updated with the value -1.0. Args: - audio_filepath_field (str): where to get path to wav file. - duration_field (str): where to put to audio duration. + audio_file_key (str): Key to get path to wav file. + duration_key (str): Key to put to audio duration. 
Returns: All the same fields as in the input manifest plus duration_field """ def __init__( self, - audio_filepath_field: str, - duration_field: str, + audio_file_key: str, + duration_key: str, **kwargs, ): super().__init__(**kwargs) - self.audio_filepath_field = audio_filepath_field - self.duration_field = duration_field + self.audio_file_key = audio_file_key + self.duration_key = duration_key def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.audio_filepath_field] + audio_filepath = data_entry[self.audio_file_key] try: data, samplerate = sf.read(audio_filepath) - data_entry[self.duration_field] = data.shape[0] / samplerate + data_entry[self.duration_key] = data.shape[0] / samplerate except Exception as e: logger.warning(str(e) + " file: " + audio_filepath) - data_entry[self.duration_field] = -1.0 + data_entry[self.duration_key] = -1.0 return [DataEntry(data=data_entry)] class FfmpegConvert(BaseParallelProcessor): """ Processor for converting video or audio files to audio using FFmpeg and updating the dataset with the path to the resampled audio. - If key_field is not None it is used as an output file name. If key_field is None the output file name is the same as input file name with different extention - and input file name saves to key_field back. + If id_key is not None it is used as an output file name. If id_key is None the output file name is the same as input file name with different extention + and input file name saves to id_key back. Args: resampled_audio_dir (str): The directory to store the resampled audio files. - input_field (str): The field in the dataset representing the path to the input video or audio files. - output_field (str): The field to store the path to the resampled audio files in the dataset. - key_field (str): The field in the dataset representing the unique key or identifier for each entry. Defaults to None. 
+ input_file_key (str): The field in the dataset representing the path to the input video or audio files. + output_file_key (str): The field to store the path to the resampled audio files in the dataset. + id_key (str): The field in the dataset representing the unique ID or identifier for each entry. Defaults to None. target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. @@ -79,17 +79,17 @@ class FfmpegConvert(BaseParallelProcessor): def __init__( self, resampled_audio_dir: str, - input_field: str, - output_field: str, - key_field: str = None, + input_file_key: str, + output_file_key: str, + id_key: str = None, target_samplerate: int = 16000, target_nchannels: int = 1, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - self.key_field = key_field + self.input_file_key = input_file_key + self.output_file_key = output_file_key + self.id_key = id_key self.resampled_audio_dir = resampled_audio_dir self.target_samplerate = target_samplerate self.target_nchannels = target_nchannels @@ -98,9 +98,9 @@ def prepare(self): os.makedirs(self.resampled_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - input_file = data_entry[self.input_field] - if self.key_field: - key = data_entry[self.key_field] + input_file = data_entry[self.input_file_key] + if self.id_key: + key = data_entry[self.id_key] os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) else: key = os.path.splitext(input_file)[0].split("/")[-1] @@ -109,9 +109,9 @@ def process_dataset_entry(self, data_entry): if not os.path.isfile(audio): ffmpeg_convert(input_file, audio, self.target_samplerate, self.target_nchannels) - data_entry[self.output_field] = audio - if 
self.key_field: - data_entry[self.key_field] = key + data_entry[self.output_file_key] = audio + if self.id_key: + data_entry[self.id_key] = key return [DataEntry(data=data_entry)] @@ -121,24 +121,24 @@ class ReadTxtLines(BaseParallelProcessor): saved in the field text_key. Args: - source_filepath (str): The field containing the file path in the manifest. - text_key (str): The field to store the read text lines in the manifest. + input_file_key (str): The key in the manifest containing the input txt file path . + text_key (str): The key to store the read text lines in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ def __init__( self, - source_filepath: str, + input_file_key: str, text_key: str, **kwargs, ): super().__init__(**kwargs) - self.source_filepath = source_filepath + self.input_file_key = input_file_key self.text_key = text_key def process_dataset_entry(self, data_entry): - fname = data_entry[self.source_filepath] + fname = data_entry[self.input_file_key] data_list = [] with open(fname, "r") as f: for line in f: diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index 640f2dd0..606d2bc5 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -14,8 +14,8 @@ import collections import re +from operator import eq, ge, gt, le, lt, ne from typing import List, Union -from operator import lt, le, eq, ne, ge, gt from sdp.logging import logger from sdp.processors.base_processor import BaseParallelProcessor, DataEntry @@ -35,21 +35,22 @@ class PreserveByValue(BaseParallelProcessor): Processor for preserving dataset entries based on a specified condition involving a target value and an input field. Args: - input_field (str): The field in the dataset entries to be evaluated. + input_value_key (str): The field in the dataset entries to be evaluated. 
target_value (Union[int, str]): The value to compare with the input field. operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ + def __init__( self, - input_field: str, + input_value_key: str, target_value: Union[int, str], operator: str = "eq", **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field + self.input_value_key = input_value_key self.target_value = target_value if operator == "lt": self.operator = lt @@ -64,16 +65,19 @@ def __init__( elif operator == "gt": self.operator = gt else: - raise ValueError('Operator must be one from the list: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than)') + raise ValueError( + 'Operator must be one from the list: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than)' + ) def process_dataset_entry(self, data_entry): - input_value = data_entry[self.input_field] + input_value = data_entry[self.input_value_key] target = self.target_value if self.operator(input_value, target): return [DataEntry(data=data_entry)] else: return [DataEntry(data=None)] - + + class DropHighLowCharrate(BaseParallelProcessor): """Drops utterances if their character rate is too low or too high. 
@@ -96,7 +100,11 @@ class DropHighLowCharrate(BaseParallelProcessor): """ def __init__( - self, high_charrate_threshold: float, low_charrate_threshold: float, text_key: str = "text", **kwargs, + self, + high_charrate_threshold: float, + low_charrate_threshold: float, + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) @@ -157,7 +165,11 @@ class DropHighLowWordrate(BaseParallelProcessor): """ def __init__( - self, high_wordrate_threshold: float, low_wordrate_threshold: float, text_key: str = "text", **kwargs, + self, + high_wordrate_threshold: float, + low_wordrate_threshold: float, + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) @@ -211,7 +223,11 @@ class DropHighLowDuration(BaseParallelProcessor): """ def __init__( - self, high_duration_threshold: float, low_duration_threshold: float, duration_key: str = "duration", **kwargs, + self, + high_duration_threshold: float, + low_duration_threshold: float, + duration_key: str = "duration", + **kwargs, ): super().__init__(**kwargs) self.high_duration_threshold = high_duration_threshold @@ -269,7 +285,10 @@ class DropIfNoneOfRegexMatch(BaseParallelProcessor): """ def __init__( - self, regex_patterns: List[str], text_key: str = "text", **kwargs, + self, + regex_patterns: List[str], + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) self.regex_patterns = regex_patterns @@ -316,7 +335,10 @@ class DropNonAlphabet(BaseParallelProcessor): """ def __init__( - self, alphabet: str, text_key: str = "text", **kwargs, + self, + alphabet: str, + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) self.alphabet = alphabet @@ -423,7 +445,8 @@ def finalize(self, metrics): beginning_drop_counter, ) logger.info( - "Num of utterances that were dropped due to asr insertions/deletions at the end: %d", end_drop_counter, + "Num of utterances that were dropped due to asr insertions/deletions at the end: %d", + end_drop_counter, ) super().finalize(metrics) @@ -445,7 +468,11 @@ 
class DropASRError(BaseParallelProcessor): """ def __init__( - self, consecutive_words_threshold: int, text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + consecutive_words_threshold: int, + text_key: str = "text", + pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.consecutive_words_threshold = consecutive_words_threshold @@ -487,7 +514,11 @@ class DropHighCER(BaseParallelProcessor): """ def __init__( - self, cer_threshold: float, text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + cer_threshold: float, + text_key: str = "text", + pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.cer_threshold = cer_threshold @@ -506,7 +537,9 @@ def finalize(self, metrics): for dropped in metrics: drop_counter += dropped logger.info( - "Num of utterances that were dropped due to CER > %d: %d", self.cer_threshold, drop_counter, + "Num of utterances that were dropped due to CER > %d: %d", + self.cer_threshold, + drop_counter, ) super().finalize(metrics) @@ -533,7 +566,11 @@ class DropHighWER(BaseParallelProcessor): """ def __init__( - self, wer_threshold: float, text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + wer_threshold: float, + text_key: str = "text", + pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.wer_threshold = wer_threshold @@ -552,7 +589,9 @@ def finalize(self, metrics): for dropped in metrics: drop_counter += dropped logger.info( - "Num of utterances that were dropped due to WER > %d: %d", self.wer_threshold, drop_counter, + "Num of utterances that were dropped due to WER > %d: %d", + self.wer_threshold, + drop_counter, ) super().finalize(metrics) @@ -579,7 +618,11 @@ class DropLowWordMatchRate(BaseParallelProcessor): """ def __init__( - self, wmr_threshold: float, text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + wmr_threshold: float, + text_key: str = "text", + 
pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.wmr_threshold = wmr_threshold @@ -599,7 +642,9 @@ def finalize(self, metrics): for dropped in metrics: drop_counter += dropped logger.info( - "Num of utterances that were dropped due to WMR < %d: %d", self.wmr_threshold, drop_counter, + "Num of utterances that were dropped due to WMR < %d: %d", + self.wmr_threshold, + drop_counter, ) super().finalize(metrics) @@ -625,7 +670,10 @@ class DropIfRegexMatch(BaseParallelProcessor): """ def __init__( - self, regex_patterns: List[str], text_key: str = "text", **kwargs, + self, + regex_patterns: List[str], + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) self.regex_patterns = regex_patterns @@ -666,7 +714,10 @@ class DropOnAttribute(BaseParallelProcessor): """ def __init__( - self, key: str, drop_if_false: bool = False, **kwargs, + self, + key: str, + drop_if_false: bool = False, + **kwargs, ): super().__init__(**kwargs) self.key = key @@ -710,7 +761,11 @@ class DropIfSubstringInInsertion(BaseParallelProcessor): """ def __init__( - self, substrings_in_insertion: List[str], text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + substrings_in_insertion: List[str], + text_key: str = "text", + pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.substrings_in_insertion = substrings_in_insertion From fcee1838e5bcd40debecf564c3561c7236322f62 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Mar 2024 20:56:24 -0700 Subject: [PATCH 088/115] nemo file Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 737 ++++++++++++++---- sdp/processors/nemo/asr_inference.py | 29 +- 2 files changed, 625 insertions(+), 141 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 010d2213..dac8cd5a 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ 
b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -1,33 +1,52 @@ +import json +import math import os import re -import math -import json import shutil import subprocess +from operator import eq, ge, gt, le, lt, ne +from pathlib import Path +from typing import Dict, List, Union + import librosa -from tqdm import tqdm -import pandas as pd import numpy as np -from typing import Dict, List, Union -from pathlib import Path -from operator import lt, le, eq, ne, ge, gt +import pandas as pd import soundfile as sf from sacrebleu import BLEU +from scipy.spatial import distance +from tqdm import tqdm -from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger -from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration -from scipy.spatial import distance +from sdp.processors.base_processor import ( + BaseParallelProcessor, + BaseProcessor, + DataEntry, +) +from sdp.processors.datasets.commoncrawl.harv_utils import ( + audio_duration, + ffmpeg_convert, + get_vtt_text, + load_manifest, + make_trans_list, + read_jsonl, + split_by_vtt_new, + text2lid, + txt2vtt, + write_jsonl, +) + class ManifestToUtf8(BaseProcessor): """ Processor to convert manifest file to UTF-8 encoding. """ + def process(self): with open(self.output_manifest_file, "w") as wout, open(self.input_manifest_file) as win: for line in win: print(json.dumps(json.loads(line), ensure_ascii=False), file=wout) - + + class DropAbsPath(BaseParallelProcessor): """ Drop absolute path @@ -36,6 +55,7 @@ class DropAbsPath(BaseParallelProcessor): path_key (str): where to get path to wav file. abs_path_to_drop (str): string to drop from the bigining of path to wav file. 
""" + def __init__( self, path_key: str, @@ -45,17 +65,17 @@ def __init__( super().__init__(**kwargs) self.path_key = path_key self.abs_path_to_drop = abs_path_to_drop - + def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.path_key] - data_entry[self.path_key]=audio_filepath[len(self.abs_path_to_drop):] + data_entry[self.path_key] = audio_filepath[len(self.abs_path_to_drop) :] return [DataEntry(data=data_entry)] class CopyFiles(BaseParallelProcessor): def __init__( self, - file_field : str, + file_field: str, path_to_copy: str, path_levels: str = 1, **kwargs, @@ -69,7 +89,7 @@ def prepare(self): os.makedirs(self.path_to_copy, exist_ok=True) def process_dataset_entry(self, data_entry): - rel_file_path = "/".join(data_entry[self.file_field].split("/")[-self.path_levels:]) + rel_file_path = "/".join(data_entry[self.file_field].split("/")[-self.path_levels :]) new_file_path = os.path.join(self.path_to_copy, rel_file_path) if not os.path.isfile(new_file_path): @@ -82,7 +102,7 @@ def process_dataset_entry(self, data_entry): class GetSpecificFiles(BaseParallelProcessor): def __init__( self, - file_field : str, + file_field: str, path_to_copy: str, **kwargs, ): @@ -91,16 +111,57 @@ def __init__( self.path_to_copy = path_to_copy self.split_map = set( - ['0634236', '0693626', '0029743', '0881322', '0357427', '0455788', '0198472', '0496259', '0812890', '0142281', '0076612', '0629004', '0931592', '0577447', '0768107', '0907768', '0963898', '0671754', '0851569', '0896715', - '0366790', '0837221', '0733702', '0278253', '0738313', '0437256', '0558223', '0292533', '0777911', '0826607', '0544257', '0744206', '0576248', '0307575', '0307577', '0879895', '0006783', '0006755', '0125649', '0896701'] + [ + '0634236', + '0693626', + '0029743', + '0881322', + '0357427', + '0455788', + '0198472', + '0496259', + '0812890', + '0142281', + '0076612', + '0629004', + '0931592', + '0577447', + '0768107', + '0907768', + '0963898', + '0671754', + '0851569', + '0896715', + 
'0366790', + '0837221', + '0733702', + '0278253', + '0738313', + '0437256', + '0558223', + '0292533', + '0777911', + '0826607', + '0544257', + '0744206', + '0576248', + '0307575', + '0307577', + '0879895', + '0006783', + '0006755', + '0125649', + '0896701', + ] ) + def prepare(self): os.makedirs(self.path_to_copy, exist_ok=True) def process_dataset_entry(self, data_entry): file_id = os.path.splitext(data_entry[self.file_field])[0].split("/")[-1] if file_id in self.split_map: - shutil.copyfile(data_entry[self.file_field],os.path.join(self.path_to_copy, file_id+".wav")) + shutil.copyfile(data_entry[self.file_field], os.path.join(self.path_to_copy, file_id + ".wav")) return [DataEntry(data=data_entry)] else: return [] @@ -136,31 +197,330 @@ def __init__( self.split_map = {} self.split_map["en"] = {} self.split_map["en"]["dev"] = set( - ['0634236', '0693626', '0029743', '0881322', '0357427', '0455788', '0198472', '0496259', '0812890', '0142281', '0076612', '0629004', '0931592', '0577447', '0768107', '0907768', '0963898', '0671754', '0851569', '0896715'] + [ + '0634236', + '0693626', + '0029743', + '0881322', + '0357427', + '0455788', + '0198472', + '0496259', + '0812890', + '0142281', + '0076612', + '0629004', + '0931592', + '0577447', + '0768107', + '0907768', + '0963898', + '0671754', + '0851569', + '0896715', + ] ) self.split_map["en"]["test"] = set( - ['0366790', '0837221', '0733702', '0278253', '0738313', '0437256', '0558223', '0292533', '0777911', '0826607', '0544257', '0744206', '0576248', '0307575', '0307577', '0879895', '0006783', '0006755', '0125649', '0896701'] + [ + '0366790', + '0837221', + '0733702', + '0278253', + '0738313', + '0437256', + '0558223', + '0292533', + '0777911', + '0826607', + '0544257', + '0744206', + '0576248', + '0307575', + '0307577', + '0879895', + '0006783', + '0006755', + '0125649', + '0896701', + ] ) self.split_map["de"] = {} self.split_map["de"]["dev"] = set( - ['0383522', '0327835', '0327898', '0619871', '0387103', '0854766', 
'0738911', '0739038', '0854558', '0505561', '0735963', '0086041', '0967593', '0114210', '0098270', '0387140', '0917035', '0327745', '0914212', '0739071'] + [ + '0383522', + '0327835', + '0327898', + '0619871', + '0387103', + '0854766', + '0738911', + '0739038', + '0854558', + '0505561', + '0735963', + '0086041', + '0967593', + '0114210', + '0098270', + '0387140', + '0917035', + '0327745', + '0914212', + '0739071', + ] ) self.split_map["de"]["test"] = set( - ['0076939', '0589098', '0916988', '0268959', '0085896', '0327813', '0085897', '0739103', '0502188', '0034822', '0327729', '0572412', '0327680', '0027277', '0324720', '0209876', '0027226', '0268926', '0209776', '0738970'] + [ + '0076939', + '0589098', + '0916988', + '0268959', + '0085896', + '0327813', + '0085897', + '0739103', + '0502188', + '0034822', + '0327729', + '0572412', + '0327680', + '0027277', + '0324720', + '0209876', + '0027226', + '0268926', + '0209776', + '0738970', + ] ) self.split_map["pl"] = {} self.split_map["pl"]["dev"] = set( - ['0977373', '0949141', '0455759', '0357429', '0401864', '0714974', '0422716', '0363476', '0714976', '0927100'] + [ + '0977373', + '0949141', + '0455759', + '0357429', + '0401864', + '0714974', + '0422716', + '0363476', + '0714976', + '0927100', + ] ) self.split_map["pl"]["test"] = set( - ['0157903', '0115644', '0774572', '0688432', '0258376', '0396163', '0456013', '0571489', '0157653', '0062567'] + [ + '0157903', + '0115644', + '0774572', + '0688432', + '0258376', + '0396163', + '0456013', + '0571489', + '0157653', + '0062567', + ] ) self.split_map["fr"] = {} self.split_map["fr"]["dev"] = set( - ['0588135', '0706751', '0533213', '0920924', '0355413', '0985711', '0113477', '0533044', '0089551', '0944509', '0944576', '0766533', '0263084', '0113490', '0647104', '0273918', '0473607', '0706753', '0800223', '0300105', '0944416', '0566712', '0533102', '0177064', '0029651', '0215767', '0054412', '0236920', '0885068', '0296098', '0113592', '0706610', '0473383', '0330163', 
'0681542', '0272523', '0985709', '0564446', '0944481', '0587986', '0804060', '0236908', '0969694', '0054058', '0800671', '0236923', '0986025', '0770086', '0825692', '0968870', '0152315', '0533147', '0647027', '0029342', '0272698', '0153863', '0355323', '0988779', '0985959', '0237013', '0338134', '0885097', '0507678', '0507687', '0944485', '0825768', '0742440', '0969664', '0885089', '0117211', '0296044', '0985958', '0214384', '0021267', '0565392', '0388467', '0151715', '0861950', '0112768', '0113596', '0621657', '0236860', '0647128', '0058479', '0803614', '0177501', '0533110', '0566787', '0944496', '0859701', '0885165', '0212639', '0054532', '0919263', '0740701'] + [ + '0588135', + '0706751', + '0533213', + '0920924', + '0355413', + '0985711', + '0113477', + '0533044', + '0089551', + '0944509', + '0944576', + '0766533', + '0263084', + '0113490', + '0647104', + '0273918', + '0473607', + '0706753', + '0800223', + '0300105', + '0944416', + '0566712', + '0533102', + '0177064', + '0029651', + '0215767', + '0054412', + '0236920', + '0885068', + '0296098', + '0113592', + '0706610', + '0473383', + '0330163', + '0681542', + '0272523', + '0985709', + '0564446', + '0944481', + '0587986', + '0804060', + '0236908', + '0969694', + '0054058', + '0800671', + '0236923', + '0986025', + '0770086', + '0825692', + '0968870', + '0152315', + '0533147', + '0647027', + '0029342', + '0272698', + '0153863', + '0355323', + '0988779', + '0985959', + '0237013', + '0338134', + '0885097', + '0507678', + '0507687', + '0944485', + '0825768', + '0742440', + '0969664', + '0885089', + '0117211', + '0296044', + '0985958', + '0214384', + '0021267', + '0565392', + '0388467', + '0151715', + '0861950', + '0112768', + '0113596', + '0621657', + '0236860', + '0647128', + '0058479', + '0803614', + '0177501', + '0533110', + '0566787', + '0944496', + '0859701', + '0885165', + '0212639', + '0054532', + '0919263', + '0740701', + ] ) self.split_map["fr"]["test"] = set( - ['0473649', '0390470', '0296024', '0355365', 
'0314592', '0682498', '0534637', '0270580', '0532999', '0373977', '0622032', '0825761', '0923303', '0113485', '0825868', '0473710', '0511698', '0844353', '0801733', '0091695', '0452351', '0825872', '0969173', '0986055', '0970208', '0141266', '0149629', '0296117', '0153112', '0801752', '0030816', '0508766', '0029390', '0825877', '0271152', '0388655', '0743376', '0177466', '0153032', '0329945', '0473606', '0986015', '0096178', '0089561', '0440564', '0741466', '0499703', '0272514', '0944571', '0919512', '0646950', '0533215', '0760703', '0733028', '0113488', '0825739', '0492402', '0214463', '0154278', '0801877', '0825675', '0675029', '0801729', '0414446', '0054425', '0279176', '0296100', '0355317', '0733026', '0089548', '0177502', '0851638', '0851640', '0448606', '0803096', '0766603', '0507914', '0092173', '0647061', '0473564', '0706765', '0766538', '0295994', '0851630', '0029358', '0647062', '0825838', '0153786', '0944526', '0944484', '0588046', '0706820', '0177465', '0622092', '0332657', '0944480'] + [ + '0473649', + '0390470', + '0296024', + '0355365', + '0314592', + '0682498', + '0534637', + '0270580', + '0532999', + '0373977', + '0622032', + '0825761', + '0923303', + '0113485', + '0825868', + '0473710', + '0511698', + '0844353', + '0801733', + '0091695', + '0452351', + '0825872', + '0969173', + '0986055', + '0970208', + '0141266', + '0149629', + '0296117', + '0153112', + '0801752', + '0030816', + '0508766', + '0029390', + '0825877', + '0271152', + '0388655', + '0743376', + '0177466', + '0153032', + '0329945', + '0473606', + '0986015', + '0096178', + '0089561', + '0440564', + '0741466', + '0499703', + '0272514', + '0944571', + '0919512', + '0646950', + '0533215', + '0760703', + '0733028', + '0113488', + '0825739', + '0492402', + '0214463', + '0154278', + '0801877', + '0825675', + '0675029', + '0801729', + '0414446', + '0054425', + '0279176', + '0296100', + '0355317', + '0733026', + '0089548', + '0177502', + '0851638', + '0851640', + '0448606', + '0803096', + 
'0766603', + '0507914', + '0092173', + '0647061', + '0473564', + '0706765', + '0766538', + '0295994', + '0851630', + '0029358', + '0647062', + '0825838', + '0153786', + '0944526', + '0944484', + '0588046', + '0706820', + '0177465', + '0622092', + '0332657', + '0944480', + ] ) def process_dataset_entry(self, data_entry): @@ -182,7 +542,7 @@ class JoinBy(BaseProcessor): input_field (str): where to get path to wav file. text_field (str): where to put resulted text. audio_field (str): where to put resulted wav file. - + Returns: All the same fields as in the input manifest plus audio_field """ @@ -204,11 +564,15 @@ def process(self): pattern = re.compile("\s{2,}") df1[self.text_field] = df1[self.text_field].apply(lambda x: pattern.sub(" ", x).strip()) # df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) - - df2 = pd.DataFrame(df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df[self.text_field].tolist())), columns=[self.text_field]).reset_index() + + df2 = pd.DataFrame( + df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df[self.text_field].tolist())), + columns=[self.text_field], + ).reset_index() df2[self.audio_field] = df2[self.input_field] write_jsonl(df2[[self.audio_field, self.text_field]], self.output_manifest_file) + class AudioDuration(BaseParallelProcessor): """ Count audio duration using audio file path from input_field @@ -219,6 +583,7 @@ class AudioDuration(BaseParallelProcessor): Returns: All the same fields as in the input manifest plus output_field """ + def __init__( self, input_field: str, @@ -228,28 +593,30 @@ def __init__( super().__init__(**kwargs) self.input_field = input_field self.output_field = output_field - + def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.input_field] try: - data_entry[self.output_field]=audio_duration(audio_filepath) + data_entry[self.output_field] = audio_duration(audio_filepath) except Exception as e: logger.warning(str(e) + " file: " + 
audio_filepath) data_entry[self.output_field] = -1.0 return [DataEntry(data=data_entry)] + class EvalBandwidth(BaseParallelProcessor): """ Count audio bandwidth using audio file path from input_field - + Args: input_field (str): where to get path to wav file. output_field (str): where to put to frequency bandwidth. threshold (str): power threshold (in dB relative to peak power in spectrum bin) to estimate frequency bandwidth. - + Returns: All the same fields as in the input manifest plus output_field. """ + def __init__( self, input_field: str, @@ -261,14 +628,14 @@ def __init__( self.input_field = input_field self.output_field = output_field self.threshold = threshold - + def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.input_field] data, samplerate = sf.read(audio_filepath) freqband = self.eval_bandwidth(data, samplerate, threshold=self.threshold) - data_entry[self.output_field]=freqband + data_entry[self.output_field] = freqband return [DataEntry(data=data_entry)] - + def eval_bandwidth(self, signal, sr, threshold=-50): time_stride = 0.01 hop_length = int(sr * time_stride) @@ -284,10 +651,11 @@ def eval_bandwidth(self, signal, sr, threshold=-50): break return freqband + class SplitByAligner(BaseParallelProcessor): """ Split wav file using NFA aligner fields: nfa_start, nfa_duration - + Args: input_field (str): field to get source wav file names. output_field: (str): field to put splited wav file names. @@ -295,6 +663,7 @@ class SplitByAligner(BaseParallelProcessor): Returns: All the same fields as in the input manifest plus output_field. 
""" + def __init__( self, input_field: str, @@ -306,7 +675,7 @@ def __init__( self.input_field = input_field self.output_field = output_field self.splited_audio_dir = splited_audio_dir - + def prepare(self): os.makedirs(self.splited_audio_dir, exist_ok=True) @@ -317,23 +686,28 @@ def process_dataset_entry(self, data_entry): data, samplerate = sf.read(audio_filepath) nfa_start = data_entry["nfa_start"] nfa_duration = data_entry["nfa_duration"] - + if math.isnan(nfa_start) or math.isnan(nfa_duration) or math.isnan(samplerate): print(audio_filepath, nfa_start, nfa_duration) data_entry[self.output_field] = data_entry['audio_filepath'] else: - start = int(nfa_start*samplerate) - duration = int(nfa_duration*samplerate) - - data_sample = data[start : start+duration] + start = int(nfa_start * samplerate) + duration = int(nfa_duration * samplerate) - wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(audio_filepath)[0].split('/')[-2:]), str(int(start*1000/samplerate))+"-"+str(int((start+duration)*1000/samplerate))+".wav") + data_sample = data[start : start + duration] + + wav_save_file = os.path.join( + self.splited_audio_dir, + '/'.join(os.path.splitext(audio_filepath)[0].split('/')[-2:]), + str(int(start * 1000 / samplerate)) + "-" + str(int((start + duration) * 1000 / samplerate)) + ".wav", + ) if not os.path.isfile(wav_save_file): os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) sf.write(wav_save_file, data_sample, samplerate) - data_entry[self.output_field]=wav_save_file + data_entry[self.output_field] = wav_save_file return [DataEntry(data=data_entry)] + class ASR_HF(BaseProcessor): """ Transcribe usinf ASR model from HuggingFace. @@ -346,6 +720,7 @@ class ASR_HF(BaseProcessor): Returns: All the same fields as in the input manifest plus output_text_field. 
""" + def __init__( self, pretrained_model: str, @@ -359,7 +734,7 @@ def __init__( self.output_text_field = output_text_field self.device = device self.batch_size = batch_size - + def process(self): import torch from huggingsound import SpeechRecognitionModel @@ -370,28 +745,25 @@ def process(self): else: self.device = "cpu" - model = SpeechRecognitionModel(self.pretrained_model, - device = self.device, - letter_case = None) + model = SpeechRecognitionModel(self.pretrained_model, device=self.device, letter_case=None) - manifest, key_dict = load_manifest(Path(self.input_manifest_file), keys = ["audio_filepath"]) + manifest, key_dict = load_manifest(Path(self.input_manifest_file), keys=["audio_filepath"]) audio_paths = key_dict["audio_filepath"] Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - transcriptions = model.transcribe(paths = audio_paths, - batch_size = self.batch_size, - decoder=None) - + transcriptions = model.transcribe(paths=audio_paths, batch_size=self.batch_size, decoder=None) + with Path(self.output_manifest_file).open('w') as f: for item, transcription in tqdm(zip(manifest, transcriptions)): item[self.output_text_field] = transcription["transcription"] f.write(json.dumps(item, ensure_ascii=False) + '\n') + class UseSonar(BaseProcessor): """ Count vector distance using Sonar library. - + Args: input_text_field (str): field with text to process. input_audio_field (str): field with audio file path to process. @@ -404,6 +776,7 @@ class UseSonar(BaseProcessor): Returns: All the same fields as in the input manifest plus output_field. 
""" + def __init__( self, input_text_field: str, @@ -418,16 +791,16 @@ def __init__( ): super().__init__(**kwargs) import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo - from torch.nn import PairwiseDistance - from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline - + from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline from sonar.models.sonar_speech.loader import load_sonar_speech_model from sonar.models.sonar_text import ( load_sonar_text_decoder_model, load_sonar_text_encoder_model, load_sonar_tokenizer, ) + from torch.nn import PairwiseDistance + self.output_field = output_field self.input_text_field = input_text_field self.input_audio_field = input_audio_field @@ -440,7 +813,7 @@ def __init__( self.pdist = PairwiseDistance(p=2) self.s2vec_model = SpeechToEmbeddingModelPipeline(encoder=self.speech_encoder_model) self.text_embedding_pipeline = TextToEmbeddingModelPipeline(self.text_encoder_model, self.text_tokenizer) - + def process(self): manifest = load_manifest(Path(self.input_manifest_file)) @@ -454,21 +827,29 @@ def process(self): f.write(json.dumps(item, ensure_ascii=False) + '\n') def get_pdist(self, input_texts, input_audios): - text_emb = self.text_embedding_pipeline.predict(input = input_texts, - batch_size = 1, - source_lang=self.text_encoder_lang) - - audio_emb = self.s2vec_model.predict(input = input_audios, - batch_size = 1, - n_parallel = 1, - pad_idx = 0, - n_prefetched_batches = 1,) + text_emb = self.text_embedding_pipeline.predict( + input=input_texts, batch_size=1, source_lang=self.text_encoder_lang + ) + + audio_emb = self.s2vec_model.predict( + input=input_audios, + batch_size=1, + n_parallel=1, + pad_idx=0, + n_prefetched_batches=1, + ) # pdist = self.pdist(text_emb, audio_emb).numpy().squeeze().astype(float).tolist() - pdist = distance.cdist(text_emb.numpy().astype(float), 
audio_emb.numpy().astype(float), 'sqeuclidean').squeeze().tolist() + pdist = ( + distance.cdist(text_emb.numpy().astype(float), audio_emb.numpy().astype(float), 'sqeuclidean') + .squeeze() + .tolist() + ) return pdist - + def process_batch(self): - manifest, dict_list = load_manifest(Path(self.input_manifest_file), keys = [self.input_audio_field, self.input_text_field]) + manifest, dict_list = load_manifest( + Path(self.input_manifest_file), keys=[self.input_audio_field, self.input_text_field] + ) manifest_len = len(manifest) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) with Path(self.output_manifest_file).open('w') as f: @@ -483,6 +864,7 @@ def process_batch(self): item[self.output_field] = dist f.write(json.dumps(item, ensure_ascii=False) + '\n') + class BLEUScore(BaseParallelProcessor): """ Count BLEU Score. @@ -494,6 +876,7 @@ class BLEUScore(BaseParallelProcessor): Returns: All the same fields as in the input manifest plus output_field. """ + def __init__( self, ref_field: str, @@ -506,16 +889,16 @@ def __init__( self.hyp_field = hyp_field self.output_field = output_field self.scorer = BLEU(effective_order=True) - + def process_dataset_entry(self, data_entry): ref = data_entry[self.ref_field] hyp = data_entry[self.hyp_field] - - res = self.scorer.sentence_score(hypothesis=hyp, - references=[ref]) + + res = self.scorer.sentence_score(hypothesis=hyp, references=[ref]) data_entry[self.output_field] = res.score return [DataEntry(data=data_entry)] + class Subprocess(BaseProcessor): """ Processor for handling subprocess execution with additional features for managing input and output manifests. 
@@ -539,6 +922,7 @@ class Subprocess(BaseProcessor): --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" """ + def __init__( self, cmd: str, @@ -556,7 +940,13 @@ def __init__( def process(self): os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: - logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") + logger.error( + "input_manifest_file " + + self.input_manifest_file + + " and output_manifest_file " + + self.output_manifest_file + + " should be exluded from cmd line!" + ) raise ValueError process_args = [x for x in self.cmd.split(" ") if x] if self.arg_separator == " ": @@ -572,6 +962,7 @@ def process(self): subprocess.run(process_args) + class NmtSubprocess(Subprocess): """ A class for executing Neural Machine Translation (NMT) subprocess with enhanced functionality for managing input and output fields. 
@@ -598,14 +989,22 @@ def __init__( self.output_field = output_field self.srctext_file = srctext_file self.tgtout_file = tgtout_file - self.cmd = self.cmd + " --srctext" + self.arg_separator + self.srctext_file + " --tgtout" + self.arg_separator + self.tgtout_file + self.cmd = ( + self.cmd + + " --srctext" + + self.arg_separator + + self.srctext_file + + " --tgtout" + + self.arg_separator + + self.tgtout_file + ) def process(self): df1 = read_jsonl(self.input_manifest_file) with Path(self.srctext_file).open('w') as f: for input_field in df1[self.input_field]: f.write(input_field + "\n") - + super().process() with Path(self.tgtout_file).open('r') as f: @@ -613,6 +1012,7 @@ def process(self): df1[self.output_field] = tgtout write_jsonl(df1, self.output_manifest_file) + class AlignerSubprocess(Subprocess): """ A class for aligning audio transcripts using an aligner subprocess with additional features for managing output fields. @@ -639,8 +1039,10 @@ def process(self): pattern = re.compile("\s{2,}") df1["text"] = df1["text"].apply(lambda x: pattern.sub(" ", x).strip()) df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) - - df2 = pd.DataFrame(df1.groupby("source_audio").apply(lambda in_df: "|".join(in_df["text"].tolist())), columns=["text"]).reset_index() + + df2 = pd.DataFrame( + df1.groupby("source_audio").apply(lambda in_df: "|".join(in_df["text"].tolist())), columns=["text"] + ).reset_index() df2['audio_filepath'] = df2['source_audio'] df2['duration'] = df2['audio_filepath'].apply(audio_duration) df2 = df2[df2['duration'] < self.duration_threshold] @@ -651,20 +1053,20 @@ def process(self): super().process() manifest_path, manifest_name = os.path.split(self.input_manifest_file) manifest_name = os.path.splitext(manifest_name)[0] - aligner_path = os.path.join(manifest_path,manifest_name+"_with_output_file_paths.json") + aligner_path = os.path.join(manifest_path, manifest_name + "_with_output_file_paths.json") df3 = read_jsonl(aligner_path) pattern 
= re.compile("") df4 = pd.DataFrame() - + for ctm_filepath in tqdm(df3["segments_level_ctm_filepath"]): source = os.path.splitext(ctm_filepath)[0].split('/')[-1] df6 = df1[df1["source"] == source].reset_index() - df5 = pd.read_csv(ctm_filepath, sep=' ', header=None, dtype={0:str}) + df5 = pd.read_csv(ctm_filepath, sep=' ', header=None, dtype={0: str}) df5["text"] = df5[4].apply(lambda x: pattern.sub(" ", x)) df5["nfa_start"] = df5[2] df5["nfa_duration"] = df5[3] if df5.shape[0] == df6.shape[0]: - df7 = df5[["nfa_start", "nfa_duration", "text"]].merge(df6, how="right") + df7 = df5[["nfa_start", "nfa_duration", "text"]].merge(df6, how="right") else: raise ValueError(ctm_filepath) @@ -672,7 +1074,7 @@ def process(self): write_jsonl(df4, self.output_manifest_file) - + class PreserveByValue(BaseParallelProcessor): """ A class for preserving dataset entries based on a specified condition involving a target value and an input field. @@ -685,6 +1087,7 @@ class PreserveByValue(BaseParallelProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ + def __init__( self, input_field: str, @@ -715,7 +1118,8 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] else: return [DataEntry(data=None)] - + + class Lang2Iso(BaseParallelProcessor): """ A class for converting language names to ISO language codes in a dataset. @@ -736,6 +1140,7 @@ class Lang2Iso(BaseParallelProcessor): Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to perform language name to ISO code conversion. 
""" + def __init__( self, input_lang_field: str, @@ -745,16 +1150,55 @@ def __init__( super().__init__(**kwargs) self.input_lang_field = input_lang_field self.output_lang_field = output_lang_field - self.iso_m = {'English':'en', 'Spanish':'es', 'Basque':'eu', 'Dutch':'nl', 'Welsh':'cy', 'Italian':'it', - 'Catalan':'ca', 'Maltese':'mt', 'Swedish':'sv', 'French':'fr', 'German':'de', 'Chuvash':'cv', - 'Kinyarwanda':'rw', 'Polish':'pl', 'Kabyle':'kab', 'Interlingua': 'ua', 'Portuguese': 'pt', 'Hakha_Chin': 'cnh', 'Romansh_Sursilvan':'roh', 'Breton':'br', 'Esperanto':'epo', 'Czech':'ces', 'Latvian':'lav', - 'Indonesian':'ind', 'Slovenian':'slv', 'Turkish':'tur', 'Frisian':'frr', 'Tatar':'tat', 'Persian':'fas', 'Estonian':'est', 'Romanian':'rum', 'Chinese_Hongkong':'zh', 'Chinese_Taiwan':'zh', 'Chinese_China':'zh', - 'Georgian':'kat', 'Kyrgyz':'kir', 'Dhivehi':'div', 'Sakha':'sah', 'Arabic':'ar', 'Japanese': 'ja'} - + self.iso_m = { + 'English': 'en', + 'Spanish': 'es', + 'Basque': 'eu', + 'Dutch': 'nl', + 'Welsh': 'cy', + 'Italian': 'it', + 'Catalan': 'ca', + 'Maltese': 'mt', + 'Swedish': 'sv', + 'French': 'fr', + 'German': 'de', + 'Chuvash': 'cv', + 'Kinyarwanda': 'rw', + 'Polish': 'pl', + 'Kabyle': 'kab', + 'Interlingua': 'ua', + 'Portuguese': 'pt', + 'Hakha_Chin': 'cnh', + 'Romansh_Sursilvan': 'roh', + 'Breton': 'br', + 'Esperanto': 'epo', + 'Czech': 'ces', + 'Latvian': 'lav', + 'Indonesian': 'ind', + 'Slovenian': 'slv', + 'Turkish': 'tur', + 'Frisian': 'frr', + 'Tatar': 'tat', + 'Persian': 'fas', + 'Estonian': 'est', + 'Romanian': 'rum', + 'Chinese_Hongkong': 'zh', + 'Chinese_Taiwan': 'zh', + 'Chinese_China': 'zh', + 'Georgian': 'kat', + 'Kyrgyz': 'kir', + 'Dhivehi': 'div', + 'Sakha': 'sah', + 'Arabic': 'ar', + 'Japanese': 'ja', + 'Russian': 'ru', + } + def process_dataset_entry(self, data_entry): data_entry[self.output_lang_field] = self.iso_m[data_entry[self.input_lang_field]] return [DataEntry(data=data_entry)] + class SplitByVttSentence(BaseParallelProcessor): 
""" A class for splitting audio files based on VTT (WebVTT) sentence-level segmentation in a dataset. @@ -778,6 +1222,7 @@ class SplitByVttSentence(BaseParallelProcessor): Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. """ + def __init__( self, splited_audio_dir: str, @@ -816,36 +1261,47 @@ def process_dataset_entry(self, data_entry): if text_list: for text, start_sr, end_sr in zip(text_list, start_s, end_s): text_c += " " + text - if start_c==0: + if start_c == 0: start_c = start_sr else: pass end_c = end_sr - if len(text_c)>0 and (end_c - start_c > self.duration_threshold * samplerate or text_c[-1] == "." or text_c[-1] == "?"): - res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) + if len(text_c) > 0 and ( + end_c - start_c > self.duration_threshold * samplerate + or text_c[-1] == "." + or text_c[-1] == "?" + ): + res_list.append( + self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c) + ) text_c = '' start_c, end_c = 0, 0 else: pass - if len(text_c)>0 and start_c!=0: + if len(text_c) > 0 and start_c != 0: res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) - + return res_list def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, end_c): data_sample = data[start_c:end_c] - wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/(samplerate/1000)))+"-"+str(int(end_c/(samplerate/1000)))+".wav") + wav_save_file = os.path.join( + self.splited_audio_dir, + '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), + str(int(start_c / (samplerate / 1000))) + "-" + str(int(end_c / (samplerate / 1000))) + ".wav", + ) if not os.path.isfile(wav_save_file): os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) sf.write(wav_save_file, data_sample, 
samplerate) - - data = {self.target_audio_field: wav_save_file, - self.duration_field: data_sample.shape[0]/samplerate, - self.text_field: text_c.strip(), - } + + data = { + self.target_audio_field: wav_save_file, + self.duration_field: data_sample.shape[0] / samplerate, + self.text_field: text_c.strip(), + } for proxy_field in self.proxy_fields: data[proxy_field] = data_entry[proxy_field] - return DataEntry(data = data) + return DataEntry(data=data) class SplitByVtt(BaseParallelProcessor): @@ -871,6 +1327,7 @@ class SplitByVtt(BaseParallelProcessor): Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. """ + def __init__( self, splited_audio_dir: str, @@ -908,14 +1365,21 @@ def process_dataset_entry(self, data_entry): wav_list, text_list, dur_list = split_by_vtt(vtt_file, source_audio, self.splited_audio_dir) if wav_list: for wav, text, dur in zip(wav_list, text_list, dur_list): - res_list.append(DataEntry(data = {self.target_audio_field: wav, - self.duration_field: dur, - self.text_field: text, - self.audio_lang_field: data_entry[self.audio_lang_field], - self.text_lang_field: data_entry[self.text_lang_field], - self.key_field: key})) + res_list.append( + DataEntry( + data={ + self.target_audio_field: wav, + self.duration_field: dur, + self.text_field: text, + self.audio_lang_field: data_entry[self.audio_lang_field], + self.text_lang_field: data_entry[self.text_lang_field], + self.key_field: key, + } + ) + ) return res_list + class AudioLid(BaseProcessor): """ A class for language identification (LID) of audio files using a pre-trained LID model. @@ -929,8 +1393,9 @@ class AudioLid(BaseProcessor): num_segments (int): Number of segments of file to use for majority vote. Delault is 1. random_seed (int): Seed for generating the starting position of the segment. Delault is None. **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
- + """ + def __init__( self, input_audio_field: str, @@ -950,10 +1415,10 @@ def __init__( self.num_segments = num_segments self.random_seed = random_seed self.device = device - + def process(self): - import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo import nemo.collections.asr as nemo_asr + import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name=self.pretrained_model) @@ -975,7 +1440,7 @@ def process(self): try: lang = model.get_label(audio_file, self.segment_duration, self.num_segments) except Exception as e: - logger.warning("AudioLid " + audio_file+ " " + str(e)) + logger.warning("AudioLid " + audio_file + " " + str(e)) lang = None if lang: @@ -999,6 +1464,7 @@ class TextLid(BaseProcessor): - process(): Processes the language identification for each text in the dataset and saves the results in a new manifest file. """ + def __init__( self, input_text_field: str, @@ -1014,10 +1480,10 @@ def __init__( self.output_lang_field = output_lang_field self.device = device self.drop_duplicates = drop_text_duplicates - + def process(self): import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo - from transformers import AutoTokenizer, AutoModelForSequenceClassification + from transformers import AutoModelForSequenceClassification, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model) text_model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_model) @@ -1043,11 +1509,12 @@ def process(self): lid = text2lid(text_model, tokenizer, text) else: lid = None - + if lid: item[self.output_lang_field] = lid f.write(json.dumps(item, ensure_ascii=False) + '\n') + class AllVttText(BaseParallelProcessor): """ A class for extracting text content from VTT (WebVTT) files and updating the manifest. 
@@ -1061,6 +1528,7 @@ class AllVttText(BaseParallelProcessor): process_dataset_entry(data_entry): Processes a single dataset entry, extracts text content from the specified VTT file, and updates the manifest. """ + def __init__( self, output_text_field: str, @@ -1070,7 +1538,7 @@ def __init__( super().__init__(**kwargs) self.output_text_field = output_text_field self.input_filepath_field = input_filepath_field - + def process_dataset_entry(self, data_entry): vtt_file = data_entry[self.input_filepath_field] res_list = [DataEntry(data=None)] @@ -1099,6 +1567,7 @@ class TxtToVtt(BaseParallelProcessor): process_dataset_entry(data_entry): Processes a single dataset entry, converts the text content to VTT format, and updates the manifest. """ + def __init__( self, vtt_files_dir: str, @@ -1112,7 +1581,7 @@ def __init__( self.key_field = key_field self.text_field = text_field self.vtt_field = vtt_field - + self.trans_list = make_trans_list() def prepare(self): @@ -1124,13 +1593,14 @@ def process_dataset_entry(self, data_entry): os.makedirs(os.path.join(self.vtt_files_dir, key.split("/")[0]), exist_ok=True) vtt_file = os.path.join(self.vtt_files_dir, key) + ".vtt" - + txt2vtt(text_file, vtt_file, self.trans_list) data_entry[self.vtt_field] = vtt_file return [DataEntry(data=data_entry)] + class ReadParquet(BaseParallelProcessor): """ A class for reading information from Parquet files and updating the manifest with video URLs and captions. @@ -1147,6 +1617,7 @@ class ReadParquet(BaseParallelProcessor): - process_dataset_entry(data_entry): Processes a single dataset entry, extracts video URLs and captions based on the key, and updates the manifest. 
""" + def __init__( self, output_video_field: str, @@ -1173,7 +1644,7 @@ def prepare(self): self.urls = pd.concat([self.urls, df1]) except Exception as e: logger.warning(str(e) + ", file: " + parquet) - + def process_dataset_entry(self, data_entry): key = data_entry[self.key_field] key = key.split("/")[1] @@ -1186,10 +1657,12 @@ def process_dataset_entry(self, data_entry): logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] + def get_key(x): key = "/".join(os.path.splitext(x)[0].split("/")[-2:]) return key + class CreateInitialManifestCC(BaseParallelProcessor): """ A class for creating an initial dataset manifest from image and text files with common keys. @@ -1207,6 +1680,7 @@ class CreateInitialManifestCC(BaseParallelProcessor): process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. """ + def __init__( self, raw_data_dir: str, @@ -1224,12 +1698,11 @@ def __init__( def prepare(self): os.makedirs(self.raw_data_dir, exist_ok=True) - def read_manifest(self): videos = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.jpg')] texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] v_df = pd.DataFrame({self.video_field: videos}) - t_df = pd.DataFrame({self.text_field: texts }) + t_df = pd.DataFrame({self.text_field: texts}) v_df[self.key_field] = v_df[self.video_field].apply(get_key) t_df[self.key_field] = t_df[self.text_field].apply(get_key) @@ -1239,11 +1712,9 @@ def read_manifest(self): return vt_df.values def process_dataset_entry(self, data_entry): - (video, key, text) = data_entry + (video, key, text) = data_entry - data = {self.video_field: video, - self.key_field: key, - self.text_field: text} + data = {self.video_field: video, self.key_field: key, self.text_field: text} return [DataEntry(data=data)] @@ -1264,6 +1735,7 @@ class FfmpegConvert(BaseParallelProcessor): 
process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. """ + def __init__( self, resampled_audio_dir: str, @@ -1285,7 +1757,7 @@ def __init__( def prepare(self): os.makedirs(self.resampled_audio_dir, exist_ok=True) return super().prepare() - + def process_dataset_entry(self, data_entry): input_file = data_entry[self.input_field] if self.key_field: @@ -1320,6 +1792,7 @@ class CreateInitialManifestExt(BaseParallelProcessor): process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. """ + def __init__( self, raw_data_dir: str, @@ -1339,9 +1812,9 @@ def read_manifest(self): input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] v_df = pd.DataFrame({self.output_field: input_files}) return v_df.values - + def process_dataset_entry(self, data_entry): (inputf) = data_entry - + data = {self.output_field: inputf[0]} - return [DataEntry(data=data)] \ No newline at end of file + return [DataEntry(data=data)] diff --git a/sdp/processors/nemo/asr_inference.py b/sdp/processors/nemo/asr_inference.py index 561bb139..5af6e254 100644 --- a/sdp/processors/nemo/asr_inference.py +++ b/sdp/processors/nemo/asr_inference.py @@ -54,12 +54,23 @@ def __init__( def process(self): """This will add "pred_text" key into the output manifest.""" os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - subprocess.run( - f"python {self.script_path} " - f"pretrained_name={self.pretrained_model} " - f"dataset_manifest={self.input_manifest_file} " - f"output_filename={self.output_manifest_file} " - f"batch_size={self.batch_size} ", - shell=True, - check=True, - ) + if self.pretrained_model[-5:] == ".nemo": + subprocess.run( + f"python {self.script_path} " + f"model_path={self.pretrained_model} " + f"dataset_manifest={self.input_manifest_file} " + 
f"output_filename={self.output_manifest_file} " + f"batch_size={self.batch_size} ", + shell=True, + check=True, + ) + else: + subprocess.run( + f"python {self.script_path} " + f"pretrained_name={self.pretrained_model} " + f"dataset_manifest={self.input_manifest_file} " + f"output_filename={self.output_manifest_file} " + f"batch_size={self.batch_size} ", + shell=True, + check=True, + ) From 5efdbcd861caafda7b6db67db4bcc78878354674 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Mar 2024 23:05:25 -0700 Subject: [PATCH 089/115] key style Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_sentence.yaml | 70 +-- .../datasets/commoncrawl/commoncrawl.py | 424 +++++------------- 2 files changed, 152 insertions(+), 342 deletions(-) diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 48bff42c..173ed633 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -6,78 +6,78 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir_s}/manifest0.json - video_field: "source_video" - text_field: "texts" - key_field: "key" + video_key: "source_video" + text_key: "texts" + id_key: "key" - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir_s}/manifest1.json - output_video_field: video_url - output_caption_field: caption_url - key_field: key + output_video_key: video_url + output_caption_key: caption_url + id_key: key - - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + - _target_: sdp.processors.FfmpegConverts output_manifest_file: ${workspace_dir_s}/manifest2.json #${workspace_dir_s}/manifest_urls.json resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 - input_field: 
"source_video" - output_field: "source_audio" - key_field: "key" + input_file_key: "source_video" + output_file_key: "source_audio" + id_key: "key" - - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + - _target_: sdp.processors.GetAudioDuration output_manifest_file: ${workspace_dir_s}/manifest3.json - input_field: source_audio - output_field: duration + audio_file_key: source_audio + duration_key: duration - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir_s}/manifest4.json - input_field: duration + input_value_key: duration target_value: 0 operator: gt - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt output_manifest_file: ${workspace_dir_s}/manifest5.json vtt_files_dir: ${workspace_dir_s}/vtts - key_field: "key" - text_field: "texts" - vtt_field: "vtt_filepath" + id_key: "key" + text_key: "texts" + vtt_key: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText output_manifest_file: ${workspace_dir_s}/manifest6.json - input_filepath_field: vtt_filepath - output_text_field: vtt_text + input_filepath_key: vtt_filepath + output_text_key: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid output_manifest_file: ${workspace_dir_s}/manifest7.json - input_text_field: vtt_text - output_lang_field: text_lang + input_text_key: vtt_text + output_lang_key: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir_s}/manifest8.json - input_lang_field: text_lang - output_lang_field: text_lang + input_lang_key: text_lang + output_lang_key: text_lang - _target_: sdp.processors.datasets.commoncrawl.AudioLid output_manifest_file: ${workspace_dir_s}/manifest9.json - input_audio_field: source_audio - output_lang_field: audio_lang + input_audio_key: source_audio + 
output_lang_key: audio_lang device: cuda pretrained_model: "langid_ambernet" - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence output_manifest_file: ${workspace_dir_s}/manifest10.json splited_audio_dir: ${workspace_dir_s}/splited/ - source_audio_field: source_audio - target_audio_field: audio_filepath - duration_field: duration - text_field: text - vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang, source_audio] + source_audio_key: source_audio + target_audio_key: audio_filepath + duration_key: duration + text_key: text + vtt_key: vtt_filepath + proxy_keys: [audio_lang, text_lang, source_audio] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration @@ -92,11 +92,11 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth input_manifest_file: ${workspace_dir_s}/manifest5.json output_manifest_file: ${workspace_dir_s}/manifest5a.json - input_field: source_audio - output_field: bandwidth + input_file_key: source_audio + bandwidth_key: bandwidth - _target_: sdp.processors.datasets.commoncrawl.GetSpecificFiles input_manifest_file: ${workspace_dir_s}/manifest6.json output_manifest_file: ${workspace_dir_s}/long_dev_test/manifest6.json - file_field: source_audio + input_file_key: source_audio path_to_copy: ${workspace_dir_s}/long_dev_test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index dac8cd5a..e982b9be 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -102,12 +102,12 @@ def process_dataset_entry(self, data_entry): class GetSpecificFiles(BaseParallelProcessor): def __init__( self, - file_field: str, + input_file_key: str, path_to_copy: str, **kwargs, ): super().__init__(**kwargs) - self.file_field = file_field + self.input_file_key = input_file_key self.path_to_copy = path_to_copy self.split_map = set( @@ -159,9 +159,9 @@ def 
prepare(self): os.makedirs(self.path_to_copy, exist_ok=True) def process_dataset_entry(self, data_entry): - file_id = os.path.splitext(data_entry[self.file_field])[0].split("/")[-1] + file_id = os.path.splitext(data_entry[self.input_file_key])[0].split("/")[-1] if file_id in self.split_map: - shutil.copyfile(data_entry[self.file_field], os.path.join(self.path_to_copy, file_id + ".wav")) + shutil.copyfile(data_entry[self.input_file_key], os.path.join(self.path_to_copy, file_id + ".wav")) return [DataEntry(data=data_entry)] else: return [] @@ -573,44 +573,13 @@ def process(self): write_jsonl(df2[[self.audio_field, self.text_field]], self.output_manifest_file) -class AudioDuration(BaseParallelProcessor): - """ - Count audio duration using audio file path from input_field - - Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to audio duration. - Returns: - All the same fields as in the input manifest plus output_field - """ - - def __init__( - self, - input_field: str, - output_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - - def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.input_field] - try: - data_entry[self.output_field] = audio_duration(audio_filepath) - except Exception as e: - logger.warning(str(e) + " file: " + audio_filepath) - data_entry[self.output_field] = -1.0 - return [DataEntry(data=data_entry)] - - class EvalBandwidth(BaseParallelProcessor): """ Count audio bandwidth using audio file path from input_field Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. + input_file_key (str): where to get path to wav file. + bandwidth_key (str): where to put to frequency bandwidth. threshold (str): power threshold (in dB relative to peak power in spectrum bin) to estimate frequency bandwidth. 
Returns: @@ -619,21 +588,21 @@ class EvalBandwidth(BaseParallelProcessor): def __init__( self, - input_field: str, - output_field: str, + input_file_key: str, + bandwidth_key: str, threshold: int = -50, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.input_file_key = input_file_key + self.bandwidth_key = bandwidth_key self.threshold = threshold def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.input_field] + audio_filepath = data_entry[self.input_file_key] data, samplerate = sf.read(audio_filepath) freqband = self.eval_bandwidth(data, samplerate, threshold=self.threshold) - data_entry[self.output_field] = freqband + data_entry[self.bandwidth_key] = freqband return [DataEntry(data=data_entry)] def eval_bandwidth(self, signal, sr, threshold=-50): @@ -1125,31 +1094,20 @@ class Lang2Iso(BaseParallelProcessor): A class for converting language names to ISO language codes in a dataset. Parameters: - - input_lang_field (str): The field in the dataset containing language names to be converted. - - output_lang_field (str): The field to store the corresponding ISO language codes. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - - Attributes: - - input_lang_field (str): The field in the dataset containing language names to be converted. - - output_lang_field (str): The field to store the corresponding ISO language codes. - - iso_m (dict): A mapping of language names to ISO language codes. + input_lang_key (str): The field in the dataset containing language names to be converted. + output_lang_key (str): The field to store the corresponding ISO language codes. - Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, converting language names to ISO language codes. 
- - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to perform language name to ISO code conversion. """ def __init__( self, - input_lang_field: str, - output_lang_field: str, + input_lang_key: str, + output_lang_key: str, **kwargs, ): super().__init__(**kwargs) - self.input_lang_field = input_lang_field - self.output_lang_field = output_lang_field + self.input_lang_key = input_lang_key + self.output_lang_key = output_lang_key self.iso_m = { 'English': 'en', 'Spanish': 'es', @@ -1195,7 +1153,7 @@ def __init__( } def process_dataset_entry(self, data_entry): - data_entry[self.output_lang_field] = self.iso_m[data_entry[self.input_lang_field]] + data_entry[self.output_lang_key] = self.iso_m[data_entry[self.input_lang_key]] return [DataEntry(data=data_entry)] @@ -1204,58 +1162,49 @@ class SplitByVttSentence(BaseParallelProcessor): A class for splitting audio files based on VTT (WebVTT) sentence-level segmentation in a dataset. Parameters: - - splited_audio_dir (str): The directory to store the split audio files. - - source_audio_field (str): The field in the dataset containing the path to the source audio files. - - target_audio_field (str): The field to store the paths of the split audio files. - - duration_field (str): The field to store the duration of each split audio segment. - - text_field (str): The field to store the transcriptions corresponding to each split audio segment. - - vtt_field (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. - - proxy_fields (List[str], optional): List of additional fields to proxy from the original data entry to the split entries. Defaults to an empty list. - - duration_threshold (float, optional): The duration threshold in seconds for each split audio segment. Defaults to 10.0. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
- - - Methods: - - prepare(): Creates the directory to store the split audio files. - - process_dataset_entry(data_entry): Processes a single dataset entry, splitting audio based on VTT sentence-level segmentation. - - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. + splited_audio_dir (str): The directory to store the split audio files. + source_audio_key (str): The field in the dataset containing the path to the source audio files. + target_audio_key (str): The field to store the paths of the split audio files. + duration_key (str): The field to store the duration of each split audio segment. + text_key (str): The field to store the transcriptions corresponding to each split audio segment. + caption_file_key (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. + proxy_keys (List[str], optional): List of additional fields to proxy from the original data entry to the split entries. Defaults to an empty list. + duration_threshold (float, optional): The duration threshold in seconds for each split audio segment. Defaults to 10.0. 
""" def __init__( self, splited_audio_dir: str, - source_audio_field: str, - target_audio_field: str, - duration_field: str, - text_field: str, - vtt_field: str, - proxy_fields: List[str] = [], + source_audio_key: str, + target_audio_key: str, + duration_key: str, + text_key: str, + caption_file_key: str, + proxy_keys: List[str] = [], duration_threshold: float = 10.0, **kwargs, ): super().__init__(**kwargs) self.splited_audio_dir = splited_audio_dir - self.source_audio_field = source_audio_field - self.target_audio_field = target_audio_field - self.duration_field = duration_field - self.text_field = text_field - self.vtt_field = vtt_field + self.source_audio_key = source_audio_key + self.target_audio_key = target_audio_key + self.duration_key = duration_key + self.text_key = text_key + self.caption_file_key = caption_file_key self.duration_threshold = duration_threshold - self.proxy_fields = proxy_fields + self.proxy_keys = proxy_keys def prepare(self): os.makedirs(self.splited_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - vtt_file = data_entry[self.vtt_field] - source_audio = data_entry[self.source_audio_field] + caption_file = data_entry[self.caption_file_key] + source_audio = data_entry[self.source_audio_key] res_list = [] if os.path.isfile(source_audio): data, samplerate = sf.read(source_audio) - text_list, start_s, end_s = split_by_vtt_new(vtt_file, samplerate) + text_list, start_s, end_s = split_by_vtt_new(caption_file, samplerate) text_c = '' start_c, end_c = 0, 0 if text_list: @@ -1272,14 +1221,16 @@ def process_dataset_entry(self, data_entry): or text_c[-1] == "?" 
): res_list.append( - self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c) + self.makeDataEntry(data_entry, data, caption_file, samplerate, text_c, start_c, end_c) ) text_c = '' start_c, end_c = 0, 0 else: pass if len(text_c) > 0 and start_c != 0: - res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) + res_list.append( + self.makeDataEntry(data_entry, data, caption_file, samplerate, text_c, start_c, end_c) + ) return res_list @@ -1295,99 +1246,23 @@ def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, sf.write(wav_save_file, data_sample, samplerate) data = { - self.target_audio_field: wav_save_file, - self.duration_field: data_sample.shape[0] / samplerate, - self.text_field: text_c.strip(), + self.target_audio_key: wav_save_file, + self.duration_key: data_sample.shape[0] / samplerate, + self.text_key: text_c.strip(), } - for proxy_field in self.proxy_fields: - data[proxy_field] = data_entry[proxy_field] + for proxy_key in self.proxy_keys: + data[proxy_key] = data_entry[proxy_key] return DataEntry(data=data) -class SplitByVtt(BaseParallelProcessor): - """ - A class for splitting audio files based on VTT (WebVTT) segmentation in a dataset. - - Parameters: - - splited_audio_dir (str): The directory to store the split audio files. - - source_audio_field (str): The field in the dataset containing the path to the source audio files. - - text_lang_field (str): The field in the dataset containing the language information of the text. - - audio_lang_field (str): The field in the dataset containing the language information of the audio. - - key_field (str): The field in the dataset containing a unique key for each entry. - - target_audio_field (str): The field to store the paths of the split audio files. - - duration_field (str): The field to store the duration of each split audio segment. 
- - text_field (str): The field to store the transcriptions corresponding to each split audio segment. - - vtt_field (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - - Methods: - - prepare(): Creates the directory to store the split audio files. - - process_dataset_entry(data_entry): Processes a single dataset entry, splitting audio based on VTT segmentation. - - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. - """ - - def __init__( - self, - splited_audio_dir: str, - source_audio_field: str, - text_lang_field: str, - audio_lang_field: str, - key_field: str, - target_audio_field: str, - duration_field: str, - text_field: str, - vtt_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.splited_audio_dir = splited_audio_dir - self.source_audio_field = source_audio_field - self.text_lang_field = text_lang_field - self.audio_lang_field = audio_lang_field - self.key_field = key_field - self.target_audio_field = target_audio_field - self.duration_field = duration_field - self.text_field = text_field - self.vtt_field = vtt_field - - def prepare(self): - os.makedirs(self.splited_audio_dir, exist_ok=True) - - def process_dataset_entry(self, data_entry): - key = data_entry[self.key_field] - vtt_file = data_entry[self.vtt_field] - source_audio = data_entry[self.source_audio_field] - res_list = [] - - if os.path.isfile(source_audio): - wav_list, text_list, dur_list = split_by_vtt(vtt_file, source_audio, self.splited_audio_dir) - if wav_list: - for wav, text, dur in zip(wav_list, text_list, dur_list): - res_list.append( - DataEntry( - data={ - self.target_audio_field: wav, - self.duration_field: dur, - self.text_field: text, - self.audio_lang_field: data_entry[self.audio_lang_field], - self.text_lang_field: 
data_entry[self.text_lang_field], - self.key_field: key, - } - ) - ) - return res_list - - class AudioLid(BaseProcessor): """ A class for language identification (LID) of audio files using a pre-trained LID model. Args: - input_audio_field (str): The field in the dataset containing the path to the audio files for language identification. + input_audio_key (str): The field in the dataset containing the path to the audio files for language identification. pretrained_model (str): The name of the pre-trained ASR model for language identification. - output_lang_field (str): The field to store the identified language for each audio file. + output_lang_key (str): The field to store the identified language for each audio file. device (str): The device to run the ASR model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. segment_duration (float): Random sample duration in seconds. Delault is np.inf. num_segments (int): Number of segments of file to use for majority vote. Delault is 1. 
@@ -1398,9 +1273,9 @@ class AudioLid(BaseProcessor): def __init__( self, - input_audio_field: str, + input_audio_key: str, pretrained_model: str, - output_lang_field: str, + output_lang_key: str, device: str, segment_duration: float = np.inf, num_segments: int = 1, @@ -1408,9 +1283,9 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.input_audio_field = input_audio_field + self.input_audio_key = input_audio_key self.pretrained_model = pretrained_model - self.output_lang_field = output_lang_field + self.output_lang_key = output_lang_key self.segment_duration = segment_duration self.num_segments = num_segments self.random_seed = random_seed @@ -1435,7 +1310,7 @@ def process(self): Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) with Path(self.output_manifest_file).open('w') as f: for item in tqdm(manifest): - audio_file = item[self.input_audio_field] + audio_file = item[self.input_audio_key] try: lang = model.get_label(audio_file, self.segment_duration, self.num_segments) @@ -1444,7 +1319,7 @@ def process(self): lang = None if lang: - item[self.output_lang_field] = lang + item[self.output_lang_key] = lang f.write(json.dumps(item, ensure_ascii=False) + '\n') @@ -1453,31 +1328,28 @@ class TextLid(BaseProcessor): A class for language identification (LID) of text using a pre-trained text classification model. Args: - input_text_field (str): The field in the dataset containing the text for language identification. + input_text_key (str): The field in the dataset containing the text for language identification. pretrained_model (str): The name or path of the pre-trained text classification model for language identification. - output_lang_field (str): The field to store the identified language for each text. + output_lang_key (str): The field to store the identified language for each text. device (str): The device to run the text classification model on (e.g., 'cuda', 'cpu'). 
If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. drop_text_duplicates (bool, optional): If True, drops duplicate texts from the output manifest. Defaults to False. **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - Methods: - - process(): Processes the language identification for each text in the dataset and saves the results in a new manifest file. - """ def __init__( self, - input_text_field: str, + input_text_key: str, pretrained_model: str, - output_lang_field: str, + output_lang_key: str, device: str, drop_text_duplicates: bool = False, **kwargs, ): super().__init__(**kwargs) - self.input_text_field = input_text_field + self.input_text_key = input_text_key self.pretrained_model = pretrained_model - self.output_lang_field = output_lang_field + self.output_lang_key = output_lang_key self.device = device self.drop_duplicates = drop_text_duplicates @@ -1502,7 +1374,7 @@ def process(self): text_set = set() with Path(self.output_manifest_file).open('w') as f: for item in tqdm(manifest): - text = item[self.input_text_field] + text = item[self.input_text_key] if self.drop_duplicates and text not in text_set: text_set.add(text) if text: @@ -1511,7 +1383,7 @@ def process(self): lid = None if lid: - item[self.output_lang_field] = lid + item[self.output_lang_key] = lid f.write(json.dumps(item, ensure_ascii=False) + '\n') @@ -1520,8 +1392,8 @@ class AllVttText(BaseParallelProcessor): A class for extracting text content from VTT (WebVTT) files and updating the manifest. Args: - output_text_field (str): The field to store the extracted text content in the manifest. - input_filepath_field (str, optional): The field in the manifest containing the path to VTT files. Defaults to "vtt_filepath". + output_text_key (str): The field to store the extracted text content in the manifest. + input_filepath_key (str, optional): The field in the manifest containing the path to VTT files. 
Defaults to "vtt_filepath". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: @@ -1531,20 +1403,20 @@ class AllVttText(BaseParallelProcessor): def __init__( self, - output_text_field: str, - input_filepath_field: str = "vtt_filepath", + output_text_key: str, + input_filepath_key: str = "vtt_filepath", **kwargs, ): super().__init__(**kwargs) - self.output_text_field = output_text_field - self.input_filepath_field = input_filepath_field + self.output_text_key = output_text_key + self.input_filepath_key = input_filepath_key def process_dataset_entry(self, data_entry): - vtt_file = data_entry[self.input_filepath_field] + vtt_file = data_entry[self.input_filepath_key] res_list = [DataEntry(data=None)] if os.path.isfile(vtt_file): try: - data_entry[self.output_text_field] = get_vtt_text(vtt_file) + data_entry[self.output_text_key] = get_vtt_text(vtt_file) res_list = [DataEntry(data=data_entry)] except Exception as e: logger.warning("AllVttText " + vtt_file + " " + str(e)) @@ -1557,7 +1429,7 @@ class TxtToVtt(BaseParallelProcessor): Args: vtt_files_dir (str): The directory where the generated VTT files will be saved. - key_field (str): The field in the manifest representing the unique key or identifier for each entry. + id_key (str): The field in the manifest representing the unique key or identifier for each entry. text_field (str): The field in the manifest containing the text content to be converted to VTT format. vtt_field (str): The field to store the generated VTT file paths in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
@@ -1571,16 +1443,16 @@ class TxtToVtt(BaseParallelProcessor): def __init__( self, vtt_files_dir: str, - key_field: str, - text_field: str, - vtt_field: str, + id_key: str, + text_key: str, + vtt_key: str, **kwargs, ): super().__init__(**kwargs) self.vtt_files_dir = vtt_files_dir - self.key_field = key_field - self.text_field = text_field - self.vtt_field = vtt_field + self.id_key = id_key + self.text_key = text_key + self.vtt_key = vtt_key self.trans_list = make_trans_list() @@ -1588,15 +1460,15 @@ def prepare(self): os.makedirs(self.vtt_files_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - key = data_entry[self.key_field] - text_file = data_entry[self.text_field] + key = data_entry[self.id_key] + text_file = data_entry[self.text_key] os.makedirs(os.path.join(self.vtt_files_dir, key.split("/")[0]), exist_ok=True) vtt_file = os.path.join(self.vtt_files_dir, key) + ".vtt" txt2vtt(text_file, vtt_file, self.trans_list) - data_entry[self.vtt_field] = vtt_file + data_entry[self.vtt_key] = vtt_file return [DataEntry(data=data_entry)] @@ -1606,30 +1478,26 @@ class ReadParquet(BaseParallelProcessor): A class for reading information from Parquet files and updating the manifest with video URLs and captions. Args: - output_video_field (str): The field to store the extracted video URLs in the manifest. - output_caption_field (str): The field to store the extracted captions in the manifest. - key_field (str): The field in the manifest representing the unique key or identifier for each entry. + output_video_key (str): The field to store the extracted video URLs in the manifest. + output_caption_key (str): The field to store the extracted captions in the manifest. + id_key (str): The field in the manifest representing the unique key or identifier for each entry. raw_data_dir (str): The directory containing Parquet files with information to be read. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
- Methods: - - prepare(): Reads and prepares information from Parquet files, storing it in the `urls` DataFrame. - - process_dataset_entry(data_entry): Processes a single dataset entry, extracts video URLs and captions based on the key, and updates the manifest. - """ def __init__( self, - output_video_field: str, - output_caption_field: str, - key_field: str, + output_video_key: str, + output_caption_key: str, + id_key: str, raw_data_dir: str, **kwargs, ): super().__init__(**kwargs) - self.output_video_field = output_video_field - self.output_caption_field = output_caption_field - self.key_field = key_field + self.output_video_key = output_video_key + self.output_caption_key = output_caption_key + self.id_key = id_key self.raw_data_dir = Path(raw_data_dir) def prepare(self): @@ -1646,14 +1514,14 @@ def prepare(self): logger.warning(str(e) + ", file: " + parquet) def process_dataset_entry(self, data_entry): - key = data_entry[self.key_field] + key = data_entry[self.id_key] key = key.split("/")[1] try: - data_entry[self.output_video_field] = self.urls.loc[key]['url'] - data_entry[self.output_caption_field] = self.urls.loc[key]['caption'] + data_entry[self.output_video_key] = self.urls.loc[key]['url'] + data_entry[self.output_caption_key] = self.urls.loc[key]['caption'] except: - data_entry[self.output_video_field] = "NN" - data_entry[self.output_caption_field] = "NN" + data_entry[self.output_video_key] = "NN" + data_entry[self.output_caption_key] = "NN" logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] @@ -1669,9 +1537,9 @@ class CreateInitialManifestCC(BaseParallelProcessor): Args: raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. - video_field (str): The field to store the paths to the image files in the dataset. - key_field (str): The field to represent the common key or identifier for each entry. 
- text_field (str): The field to store the paths to the text files in the dataset. + video_key (str): The field to store the paths to the image files in the dataset. + id_key (str): The field to represent the common key or identifier for each entry. + text_key (str): The field to store the paths to the text files in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: @@ -1684,16 +1552,16 @@ class CreateInitialManifestCC(BaseParallelProcessor): def __init__( self, raw_data_dir: str, - video_field: str, - key_field: str, - text_field: str, + video_key: str, + id_key: str, + text_key: str, **kwargs, ): super().__init__(**kwargs) self.raw_data_dir = Path(raw_data_dir) - self.video_field = video_field - self.key_field = key_field - self.text_field = text_field + self.video_key = video_key + self.id_key = id_key + self.text_key = text_key def prepare(self): os.makedirs(self.raw_data_dir, exist_ok=True) @@ -1701,81 +1569,23 @@ def prepare(self): def read_manifest(self): videos = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.jpg')] texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] - v_df = pd.DataFrame({self.video_field: videos}) - t_df = pd.DataFrame({self.text_field: texts}) - - v_df[self.key_field] = v_df[self.video_field].apply(get_key) - t_df[self.key_field] = t_df[self.text_field].apply(get_key) - v_df = v_df.drop_duplicates(self.key_field) - t_df = t_df.drop_duplicates(self.key_field) - vt_df = v_df.merge(t_df, on=self.key_field, how="left") + v_df = pd.DataFrame({self.video_key: videos}) + t_df = pd.DataFrame({self.text_key: texts}) + + v_df[self.id_key] = v_df[self.video_key].apply(get_key) + t_df[self.id_key] = t_df[self.text_key].apply(get_key) + v_df = v_df.drop_duplicates(self.id_key) + t_df = t_df.drop_duplicates(self.id_key) + vt_df = v_df.merge(t_df, on=self.id_key, how="left") return vt_df.values def process_dataset_entry(self, 
data_entry): (video, key, text) = data_entry - data = {self.video_field: video, self.key_field: key, self.text_field: text} + data = {self.video_key: video, self.id_key: key, self.text_key: text} return [DataEntry(data=data)] -class FfmpegConvert(BaseParallelProcessor): - """ - A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. - - Args: - resampled_audio_dir (str): The directory to store the resampled audio files. - input_field (str): The field in the dataset representing the path to the input video files. - output_field (str): The field to store the path to the resampled audio files in the dataset. - key_field (str): The field in the dataset representing the unique key or identifier for each entry. - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. 
- - """ - - def __init__( - self, - resampled_audio_dir: str, - input_field: str, - output_field: str, - key_field: str = None, - target_samplerate: int = 16000, - target_nchannels: int = 1, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - self.key_field = key_field - self.resampled_audio_dir = resampled_audio_dir - self.target_samplerate = target_samplerate - self.target_nchannels = target_nchannels - - def prepare(self): - os.makedirs(self.resampled_audio_dir, exist_ok=True) - return super().prepare() - - def process_dataset_entry(self, data_entry): - input_file = data_entry[self.input_field] - if self.key_field: - key = data_entry[self.key_field] - os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) - else: - key = os.path.splitext(input_file)[0].split("/")[-1] - audio = os.path.join(self.resampled_audio_dir, key) + ".wav" - - if not os.path.isfile(audio): - ffmpeg_convert(input_file, audio, self.target_samplerate, self.target_nchannels) - - data_entry[self.output_field] = audio - if self.key_field: - data_entry[self.key_field] = key - return [DataEntry(data=data_entry)] - - class CreateInitialManifestExt(BaseParallelProcessor): """ A class for creating an initial dataset manifest from audio files with a specified extension. 
From 981d5bde0642a9ae8ee70c8a807993793bd9fbe0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 15 Mar 2024 01:35:18 -0700 Subject: [PATCH 090/115] rm PreserveByValue Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 45 ------------------- 1 file changed, 45 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index e982b9be..2d0baec6 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -1044,51 +1044,6 @@ def process(self): write_jsonl(df4, self.output_manifest_file) -class PreserveByValue(BaseParallelProcessor): - """ - A class for preserving dataset entries based on a specified condition involving a target value and an input field. - - Parameters: - input_field (str): The field in the dataset entries to be evaluated. - target_value (Union[int, str]): The value to compare with the input field. - operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), - "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
- - """ - - def __init__( - self, - input_field: str, - target_value: Union[int, str], - operator: str = "eq", - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.target_value = target_value - if operator == "lt": - self.operator = lt - elif operator == "le": - self.operator = le - elif operator == "eq": - self.operator = eq - elif operator == "ne": - self.operator = ne - elif operator == "ge": - self.operator = ge - elif operator == "gt": - self.operator = gt - - def process_dataset_entry(self, data_entry): - input_value = data_entry[self.input_field] - target = self.target_value - if self.operator(input_value, target): - return [DataEntry(data=data_entry)] - else: - return [DataEntry(data=None)] - - class Lang2Iso(BaseParallelProcessor): """ A class for converting language names to ISO language codes in a dataset. From a1e3fab87123452aea7b275dc689835f4baaee35 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 15 Mar 2024 01:41:59 -0700 Subject: [PATCH 091/115] black Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 1 - .../datasets/commoncrawl/harv_utils.py | 117 ++++++++++-------- 2 files changed, 67 insertions(+), 51 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 2d0baec6..045949fa 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -24,7 +24,6 @@ ) from sdp.processors.datasets.commoncrawl.harv_utils import ( audio_duration, - ffmpeg_convert, get_vtt_text, load_manifest, make_trans_list, diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py index 92b3ffd1..9c9ae837 100644 --- a/sdp/processors/datasets/commoncrawl/harv_utils.py +++ b/sdp/processors/datasets/commoncrawl/harv_utils.py @@ -1,15 +1,17 @@ +import json import os -import torch -# import ffmpeg # pip install 
ffmpeg-python -import webvtt # pip install webvtt-py -import subprocess, sys -import json, os -import soundfile as sf -from typing import Dict, List, Union +import subprocess +import sys from datetime import datetime -import numpy as np from pathlib import Path +from typing import Dict, List, Union + +import numpy as np import pandas as pd +import soundfile as sf +import torch +import webvtt # pip install webvtt-py + from sdp.logging import logger @@ -20,21 +22,23 @@ def read_jsonl(manifest_file): rec.append(json.loads(l)) return pd.DataFrame.from_records(rec) + def write_jsonl(df_in, manifest_filename): with open(manifest_filename, 'w') as the_file: for i, x in enumerate(df_in.itertuples()): r_dict = {} for column in df_in.columns: - r_dict[column] = getattr(x,column) + r_dict[column] = getattr(x, column) l1 = json.dumps(r_dict) - the_file.write(l1+'\n') + the_file.write(l1 + '\n') + def load_manifest(manifest: Path, keys: List[str] = []) -> List[Dict[str, Union[str, float]]]: result = [] r_dict = dict() for key in keys: r_dict[key] = list() - + with manifest.open() as f: for i, line in enumerate(f): data = json.loads(line) @@ -46,25 +50,29 @@ def load_manifest(manifest: Path, keys: List[str] = []) -> List[Dict[str, Union[ else: return result + def get_vtt_text(vtt_file): text_all = [] - if os.path.splitext(vtt_file)[1]=='.vtt': + if os.path.splitext(vtt_file)[1] == '.vtt': webvtt_i = webvtt.read - elif os.path.splitext(vtt_file)[1]=='.srt': + elif os.path.splitext(vtt_file)[1] == '.srt': webvtt_i = webvtt.from_srt else: - raise ValueError("Unsupported extention of file "+vtt_file) + raise ValueError("Unsupported extention of file " + vtt_file) for caption in webvtt_i(vtt_file): text = caption.text - if text.find("thumbnails")!=-1: + if text.find("thumbnails") != -1: pass else: text_all.append(' '.join(text.split('\n'))) return ' '.join(text_all) + def text2lid(text_model, tokenizer, text): - text_langs = "Arabic, Basque, Breton, Catalan, Chinese_China, 
Chinese_Hongkong, Chinese_Taiwan, Chuvash, Czech, Dhivehi, Dutch, English, Esperanto, Estonian, French, Frisian, Georgian, German, Greek, Hakha_Chin, Indonesian, Interlingua, Italian, Japanese, Kabyle, Kinyarwanda, Kyrgyz, Latvian, Maltese, Mongolian, Persian, Polish, Portuguese, Romanian, Romansh_Sursilvan, Russian, Sakha, Slovenian, Spanish, Swedish, Tamil, Tatar, Turkish, Ukranian, Welsh".split(", ") + text_langs = "Arabic, Basque, Breton, Catalan, Chinese_China, Chinese_Hongkong, Chinese_Taiwan, Chuvash, Czech, Dhivehi, Dutch, English, Esperanto, Estonian, French, Frisian, Georgian, German, Greek, Hakha_Chin, Indonesian, Interlingua, Italian, Japanese, Kabyle, Kinyarwanda, Kyrgyz, Latvian, Maltese, Mongolian, Persian, Polish, Portuguese, Romanian, Romansh_Sursilvan, Russian, Sakha, Slovenian, Spanish, Swedish, Tamil, Tatar, Turkish, Ukranian, Welsh".split( + ", " + ) inputs = tokenizer(text[:512], return_tensors="pt").to("cuda:0") with torch.no_grad(): text_logits = text_model(**inputs).logits @@ -74,80 +82,83 @@ def text2lid(text_model, tokenizer, text): def parse_hours(inp): inp_list = inp.split(":") - if len(inp_list) == 3 and int(inp_list[0])>=24: - hours = int(inp_list[0])%24 - days = int(inp_list[0])//24 + if len(inp_list) == 3 and int(inp_list[0]) >= 24: + hours = int(inp_list[0]) % 24 + days = int(inp_list[0]) // 24 if days < 31: - inp = str(1+days)+":"+str(hours)+":"+":".join(inp_list[1:]) + inp = str(1 + days) + ":" + str(hours) + ":" + ":".join(inp_list[1:]) return datetime.strptime(inp, '%d:%H:%M:%S.%f') else: - months = days//31 - days = days%31 - inp = str(1+months)+"/"+str(1+days)+" "+str(hours)+":"+":".join(inp_list[1:]) + months = days // 31 + days = days % 31 + inp = str(1 + months) + "/" + str(1 + days) + " " + str(hours) + ":" + ":".join(inp_list[1:]) return datetime.strptime(inp, '%m/%d %H:%M:%S.%f') else: return datetime.strptime(inp, '%H:%M:%S.%f') - + + def split_by_vtt(vtt_file, wav_file, wav_save_path): try: data, samplerate = 
sf.read(wav_file) target_sr = samplerate - if len(data.shape)>1: + if len(data.shape) > 1: data = np.mean(data, axis=1) _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') rel_vtt_file = '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]) wav_list, text_list, dur_list = [], [], [] for caption in webvtt.read(vtt_file): _start = parse_hours(caption.start) - start = (_start-_begin).total_seconds() - start_sr = int(start*samplerate) + start = (_start - _begin).total_seconds() + start_sr = int(start * samplerate) _end = parse_hours(caption.end) - end = (_end-_begin).total_seconds() - end_sr = int(end*samplerate) + end = (_end - _begin).total_seconds() + end_sr = int(end * samplerate) text = ' '.join(caption.text.split('\n')) - wav_save_file = os.path.join(wav_save_path, rel_vtt_file, str(int(start*1000))+"-"+str(int(end*1000))+".wav") + wav_save_file = os.path.join( + wav_save_path, rel_vtt_file, str(int(start * 1000)) + "-" + str(int(end * 1000)) + ".wav" + ) os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) # number_of_samples = round(len(data[start_sr:end_sr]) * float(target_sr) / samplerate) # if number_of_samples > 0: - # if not os.path.exists(wav_save_file): - # data_sample = sps.resample(data[start_sr:end_sr], number_of_samples) + # if not os.path.exists(wav_save_file): + # data_sample = sps.resample(data[start_sr:end_sr], number_of_samples) data_sample = data[start_sr:end_sr] sf.write(wav_save_file, data_sample, target_sr) text_list.append(text) wav_list.append(wav_save_file) - dur_list.append(data_sample.shape[0]/samplerate) #(_end-_start).total_seconds() + dur_list.append(data_sample.shape[0] / samplerate) # (_end-_start).total_seconds() return wav_list, text_list, dur_list except Exception as e: logger.warning(str(e) + vtt_file) return None, None, None - + + def split_by_vtt_new(vtt_file, samplerate): try: _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') text_list, start_s, end_s = [], [], [] - if 
os.path.splitext(vtt_file)[1]=='.vtt': + if os.path.splitext(vtt_file)[1] == '.vtt': webvtt_i = webvtt.read - elif os.path.splitext(vtt_file)[1]=='.srt': + elif os.path.splitext(vtt_file)[1] == '.srt': webvtt_i = webvtt.from_srt else: - raise ValueError("Unsupporte extention of file "+vtt_file) - + raise ValueError("Unsupporte extention of file " + vtt_file) - for caption in webvtt_i(vtt_file): + for caption in webvtt_i(vtt_file): text = ' '.join(caption.text.split('\n')) _start = parse_hours(caption.start) - start = (_start-_begin).total_seconds() - start_sr = int(start*samplerate) + start = (_start - _begin).total_seconds() + start_sr = int(start * samplerate) _end = parse_hours(caption.end) - end = (_end-_begin).total_seconds() - end_sr = int(end*samplerate) - + end = (_end - _begin).total_seconds() + end_sr = int(end * samplerate) + text_list.append(text.strip()) start_s.append(start_sr) end_s.append(end_sr) @@ -156,9 +167,11 @@ def split_by_vtt_new(vtt_file, samplerate): logger.warning(str(e) + vtt_file) return None, None, None + def audio_duration(fname): data, samplerate = sf.read(fname) - return data.shape[0]/samplerate + return data.shape[0] / samplerate + def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] @@ -166,30 +179,34 @@ def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): if ar: process_args = process_args[:-1] process_args.extend(["-ar", str(ar), wav]) - return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) + return subprocess.run(process_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def read_txt(txt_file): with open(txt_file, "r") as f: text = f.read() return text[2:-1].replace("\\n", "\n").replace("\\r", "\r") - + + def translate(txt, trans_list): for trans in trans_list: txt = txt.replace(trans[0], trans[1]) return txt + def txt2vtt(txt_file: str, vtt_file: str, 
trans_list: List): txt = read_txt(txt_file) if txt: if txt[:6] == "WEBVTT": pass else: - txt = "WEBVTT"+txt -# print(f"'{txt[:7]}''") + txt = "WEBVTT" + txt + # print(f"'{txt[:7]}''") vtt = translate(txt, trans_list) with open(vtt_file, "w") as f: f.write(vtt) + def make_trans_list(): t1 = """U+0000   U+0001 \' \\' @@ -836,5 +853,5 @@ def make_trans_list(): trans_list = [] for a in t1.split('\n'): b = a.split("\t") - trans_list.append((b[2],b[1])) + trans_list.append((b[2], b[1])) return trans_list From ab8c685764a7238c32d07670edf1f61d0cde1b77 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 18 Mar 2024 23:19:44 -0700 Subject: [PATCH 092/115] rm operator Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/commoncrawl/commoncrawl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 045949fa..77a9ddaa 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -4,7 +4,6 @@ import re import shutil import subprocess -from operator import eq, ge, gt, le, lt, ne from pathlib import Path from typing import Dict, List, Union From f31f7d1d27f9e7f4fde00e9b8e34799802bd8bac Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 19 Mar 2024 09:29:45 -0700 Subject: [PATCH 093/115] batch_size > 1 Signed-off-by: Nikolay Karpov --- .../huggingface/speech_recognition.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index d8702246..12a4e5fa 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -95,9 +95,11 @@ class ASRTransformers(BaseProcessor): Args: pretrained_model (str): name of pretrained model on HuggingFace. - output_text_field (str): field to save transcription result. 
+ output_text_key (str): Key to save transcription result. + input_audio_key (str): Key to read audio file. Defaults to "audio_filepath". + input_duration_key (str): Audio duration key. Defaults to "duration". device (str): Inference device. - batch_size (int): Inference batch size. Defaults to 1. TODO: support batch_size > 1 + batch_size (int): Inference batch size. Defaults to 1. torch_dtype (str): Tensor data type. Default to "float32" """ @@ -105,6 +107,8 @@ def __init__( self, pretrained_model: str, output_text_key: str, + input_audio_key: str = "audio_filepath", + input_duration_key: str = "duration", device: str = None, batch_size: int = 1, torch_dtype: str = "float32", @@ -119,7 +123,9 @@ def __init__( logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.") self.pretrained_model = pretrained_model + self.input_audio_key = input_audio_key self.output_text_key = output_text_key + self.input_duration_key = input_duration_key self.device = device self.batch_size = batch_size if torch_dtype == "float32": @@ -156,12 +162,18 @@ def __init__( def process(self): json_list = load_manifest(Path(self.input_manifest_file)) + json_list_sorted = sorted(json_list, key=lambda d: d[self.input_duration_key], reverse=True) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(json_list): - pred_text = self.pipe(item["audio_filepath"])["text"] - - item[self.output_text_key] = pred_text - f.write(json.dumps(item, ensure_ascii=False) + '\n') + start_index = 0 + for _ in tqdm(range(len(json_list_sorted) // self.batch_size)): + batch = json_list_sorted[start_index : start_index + self.batch_size] + start_index += self.batch_size + audio_files = [item[self.input_audio_key] for item in batch] + results = self.pipe(audio_files) + + for i, item in enumerate(batch): + item[self.output_text_key] = results[i]["text"] + f.write(json.dumps(item, 
ensure_ascii=False) + '\n') From 02b35a8caaaca997907115acb467017938d46708 Mon Sep 17 00:00:00 2001 From: Sasha Meister <117230141+ssh-meister@users.noreply.github.com> Date: Tue, 19 Mar 2024 20:51:44 +0100 Subject: [PATCH 094/115] German Youtube with new processors (#49) * YouTube German config and new processors Signed-off-by: Sasha Meister * Added Merge Manifests processor Signed-off-by: Sasha Meister * Clean de.yaml pipeline config Signed-off-by: Sasha Meister * Fix Lang2Iso Signed-off-by: Sasha Meister * fix typo * fix empty list error - IndexError: list index out of range * Added requirements.txt Signed-off-by: Sasha Meister * Fixed paths for audio TN Signed-off-by: Sasha Meister * Updated requirements.txt Signed-off-by: Sasha Meister --------- Signed-off-by: Sasha Meister --- dataset_configs/youtube/de.yaml | 253 ++++++++++++++++++ sdp/processors/__init__.py | 2 +- .../datasets/commoncrawl/__init__.py | 8 +- .../datasets/commoncrawl/commoncrawl.py | 2 +- sdp/processors/datasets/youtube/__init__.py | 18 ++ .../datasets/youtube/aggregate_segments.py | 71 +++++ .../youtube/create_initial_manifest.py | 90 +++++++ .../datasets/youtube/merge_manifests.py | 35 +++ .../datasets/youtube/requirements.txt | 2 + sdp/processors/datasets/youtube/utils.py | 103 +++++++ sdp/processors/nemo/asr_inference.py | 32 +++ .../nemo/transcribe_speech_parallel.py | 208 ++++++++++++++ 12 files changed, 818 insertions(+), 6 deletions(-) create mode 100644 dataset_configs/youtube/de.yaml create mode 100644 sdp/processors/datasets/youtube/__init__.py create mode 100644 sdp/processors/datasets/youtube/aggregate_segments.py create mode 100644 sdp/processors/datasets/youtube/create_initial_manifest.py create mode 100644 sdp/processors/datasets/youtube/merge_manifests.py create mode 100644 sdp/processors/datasets/youtube/requirements.txt create mode 100644 sdp/processors/datasets/youtube/utils.py create mode 100644 sdp/processors/nemo/transcribe_speech_parallel.py diff --git 
a/dataset_configs/youtube/de.yaml b/dataset_configs/youtube/de.yaml new file mode 100644 index 00000000..333536b1 --- /dev/null +++ b/dataset_configs/youtube/de.yaml @@ -0,0 +1,253 @@ +processors_to_run: "3:" +base_dir: "/data/supervised/2/audios" +workspace_dir: "/data/processed/2" + +# filters +lang: de +min_duration: 1.0 +max_duration: 40.0 +max_wer: 75.0 +max_cer: 30.0 + + +processors: + # Create initial manifests based on pairs of .opus audio + .srt transcript (with ground-truth timestamps) + - _target_: sdp.processors.datasets.youtube.CreateInitialManifest + data_dir: ${base_dir} + output_audio_dir: ${workspace_dir}/audio/wav_samples + output_manifest_file: ${workspace_dir}/manifest1.json + chunksize: 10 + in_memory_chunksize: 400 + + # Aggregate ground-truth segments to longer one based on duration threshold + - _target_: sdp.processors.datasets.youtube.AggregateSegments + max_duration: ${max_duration} + output_segments_audio_dir: ${workspace_dir}/audio/wav_segments + output_manifest_file: ${workspace_dir}/manifest2.json + + # Filter out samples which duration is out of range 0-40 sec. 
+ - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest3.json + low_duration_threshold: ${min_duration} + high_duration_threshold: ${max_duration} + + # Identify language of the text + - _target_: sdp.processors.datasets.commoncrawl.TextLid + output_manifest_file: ${workspace_dir}/manifest4.json + input_text_key: orig_text + output_lang_key: text_lang + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + device: cuda + drop_text_duplicates: True + + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso + output_manifest_file: ${workspace_dir}/manifest5.json + input_lang_key: text_lang + output_lang_key: text_lang + + ## Filter out samples with text in non-target language + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest6.json + input_value_key: text_lang + target_value: ${lang} + + # Identify language of the audio + - _target_: sdp.processors.datasets.commoncrawl.AudioLid + output_manifest_file: ${workspace_dir}/manifest7.json + input_audio_key: audio_filepath + output_lang_key: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + ## Filter out samples with audio in non-target language + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest8.json + input_value_key: audio_lang + target_value: ${lang} + + # ASR Inference + - _target_: sdp.processors.ASRInferenceParallel + output_manifest_file: ${workspace_dir}/manifest9.json + pretrained_model: nvidia/stt_${lang}_fastconformer_hybrid_large_pc + batch_size: 64 + devices: 4 + + ## Merge manifests + - _target_: sdp.processors.datasets.youtube.MergeManifests + input_manifest_file: ${workspace_dir}/manifest8.json + input_manifest_file2: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest10.json + key_field: audio_filepath + fields_to_merge: + - {"pred_text" : "pred_text_pc"} + + # Filter out samples with empty pred_text_pc + - 
_target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: pred_text_pc + regex_patterns: + - "^\\s*$" + + # Preprocess orig text for audio-based TN + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"orig_text" : "pre_normalized"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: pre_normalized + regex_params_list: + - {"pattern": '\\[hn]', "repl" : " "} + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "\\[", "repl" : " "} + - {"pattern": "\\]", "repl" : " "} + - {"pattern": "!", "repl" : "."} + - {"pattern": "\\)", "repl" : " "} + - {"pattern": "\\(", "repl" : " "} + - {"pattern": "“", "repl" : " "} + - {"pattern": "„", "repl" : " "} + - {"pattern": "–", "repl" : " "} + - {"pattern": ";", "repl" : ","} + - {"pattern": "'", "repl" : " "} + - {"pattern": "…", "repl" : "."} + - {"pattern": "«", "repl" : " "} + - {"pattern": "»", "repl" : " "} + - {"pattern": "’", "repl" : " "} + - {"pattern": "‘", "repl" : " "} + - {"pattern": "”", "repl" : " "} + - {"pattern": "—", "repl" : " "} + - {"pattern": "´", "repl" : " "} + - {"pattern": "″", "repl" : " "} + - {"pattern": "`", "repl" : " "} + - {"pattern": "\\|", "repl" : " "} + - {"pattern": "−", "repl" : " "} + - {"pattern": "‟", "repl" : " "} + - {"pattern": "‒", "repl" : " "} + - {"pattern": " ", "repl" : " "} + - {"pattern": "", "repl" : " "} + - {"pattern": "‐", "repl" : " "} + - {"pattern": "ʻ", "repl" : " "} + - {"pattern": "′", "repl" : " "} + - {"pattern": "\\\\", "repl" : " "} + - {"pattern": "^\\s?\\.\\.\\.", "repl" : ""} + - {"pattern": "\\s?\\.\\.\\.$", "repl" : "."} + + ## Remove extra space + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: pre_normalized + regex_params_list: + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "^\\s+", "repl" : ""} + - {"pattern": 
"\\s+$", "repl" : ""} + + ## Filter out samples out of Regex + - _target_: sdp.processors.DropIfNoneOfRegexMatch + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pre_normalized + regex_patterns: + - "^[ !#$%&'*+,\\-.0-9:=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_abcdefghijklmnopqrstuvwxyz{}~£¥°²³µÄÖÜßäöüμω₩€/]+$" + + # Run audio based TN + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest16.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=${lang} --n_jobs=-1 --batch_size=600 --manifest_text_field=pre_normalized --manifest_asr_pred_field=pred_text_pc \ + --cache_dir=${workspace_dir}/cache \ + --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist.tsv" + + # Post-normalization processing + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest17.json + duplicate_fields: {"normalized" : "post_normalized"} + + ## Extra chars removing from normalized text + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: post_normalized + regex_params_list: + - {"pattern": "['\\-:{}\\/]", "repl" : " "} + - {"pattern": "!", "repl" : "."} + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "^\\s+", "repl" : ""} + - {"pattern": "\\s+$", "repl" : ""} + + ## Remove samples with chars out of list (letters, comma, period, question mark, space) + - _target_: sdp.processors.DropIfNoneOfRegexMatch + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: post_normalized + regex_patterns: + - "^[a-zA-ZäÄöÖüÜß,\\.?\\s]+$" + + # Create text field with lowercased clean "post_normalized" + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest20.json + 
duplicate_fields: {"post_normalized" : "text"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest21.json + text_key: "text" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest22.json + text_key: "text" + regex_params_list: + - {"pattern": "[\\.\\?\\,]", "repl" : " "} + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "^\\s+", "repl" : ""} + - {"pattern": "\\s+$", "repl" : ""} + + # Create pred_text field with lowercased clean "pred_text_pc" + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest23.json + duplicate_fields: {"pred_text_pc" : "pred_text"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest24.json + text_key: "pred_text" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest25.json + text_key: "pred_text" + regex_params_list: + - {"pattern": "[\\.\\?\\,]", "repl" : " "} + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "^\\s+", "repl" : ""} + - {"pattern": "\\s+$", "repl" : ""} + + # Filtration + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest26.json + cer_threshold: ${max_cer} + text_key: "text" + pred_text_key: "pred_text" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest27.json + wer_threshold: ${max_wer} + text_key: "text" + pred_text_key: "pred_text" + + # Finalization + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/manifest28.json + fields_to_keep: ["audio_filepath", "duration", "post_normalized"] + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest29.json + rename_fields: {"post_normalized":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${workspace_dir}/clean_data/audio/ + path_levels: 1 + + - _target_: 
sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${workspace_dir}/clean_data/${lang}_manifest.json + path_key: audio_filepath + abs_path_to_drop: ${workspace_dir} + + diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index f7a896e1..2ab441c5 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -74,5 +74,5 @@ from sdp.processors.modify_manifest.make_letters_uppercase_after_period import ( MakeLettersUppercaseAfterPeriod, ) -from sdp.processors.nemo.asr_inference import ASRInference +from sdp.processors.nemo.asr_inference import ASRInference, ASRInferenceParallel from sdp.processors.nemo.pc_inference import PCInference diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index b4fe3020..7ee1c072 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ - Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ - ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ - SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ +from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, \ + Lang2Iso, SplitByVttSentence, AudioLid, TextLid, AllVttText, TxtToVtt, \ + ReadParquet, CreateInitialManifestCC, ASR_HF, AlignerSubprocess, \ + SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, \ TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles, ManifestToUtf8 diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 77a9ddaa..4f441979 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -1106,7 +1106,7 @@ def __init__( } def process_dataset_entry(self, data_entry): - data_entry[self.output_lang_key] = self.iso_m[data_entry[self.input_lang_key]] + data_entry[self.output_lang_key] = self.iso_m.get(data_entry[self.input_lang_key], None) return [DataEntry(data=data_entry)] diff --git a/sdp/processors/datasets/youtube/__init__.py b/sdp/processors/datasets/youtube/__init__.py new file mode 100644 index 00000000..119ac1ca --- /dev/null +++ b/sdp/processors/datasets/youtube/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .create_initial_manifest import CreateInitialManifest +from .utils import parse_srt +from .aggregate_segments import * +from .merge_manifests import MergeManifests \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/aggregate_segments.py b/sdp/processors/datasets/youtube/aggregate_segments.py new file mode 100644 index 00000000..d97524c4 --- /dev/null +++ b/sdp/processors/datasets/youtube/aggregate_segments.py @@ -0,0 +1,71 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pydub import AudioSegment +import os + +from sdp.processors.base_processor import BaseParallelProcessor +from sdp.processors.datasets.youtube.utils import RawSegment, AggregatedSegment, get_audio_segment + + +class AggregateSegments(BaseParallelProcessor): + def __init__( + self, + max_duration: float = 40.0, + crop_audio_segments: bool = True, + output_segments_audio_dir: str = None, + **kwargs, + ): + super().__init__(**kwargs) + self.max_duration = max_duration + self.crop_audio_segments = crop_audio_segments + self.output_segments_audio_dir = output_segments_audio_dir + + def prepare(self): + if self.crop_audio_segments and self.output_segments_audio_dir: + os.makedirs(os.path.join(self.output_segments_audio_dir), exist_ok=True) + + def process_dataset_entry(self, data_entry: dict): + sample_id = data_entry['sample_id'] + segments = data_entry['segments'] + agg_segments = [] + + if len(segments) == 0: + return agg_segments + + first_segment = RawSegment(**segments[0]) + agg_segment = AggregatedSegment(segment=first_segment, segment_id=1, sample_id=sample_id, + output_audio_dir = self.output_segments_audio_dir) + + for segment in segments[1 : ]: + segment = RawSegment(**segment) + + if (not agg_segment.duration_match or + agg_segment.duration >= self.max_duration or + segment.end_time - agg_segment.start_time >= self.max_duration): + agg_segments.append(agg_segment.to_dataentry()) + agg_segment = AggregatedSegment(segment=segment, + segment_id=len(agg_segments) + 1, sample_id=sample_id, + output_audio_dir = self.output_segments_audio_dir) + else: + agg_segment.aggregate(segment) + else: + agg_segments.append(agg_segment.to_dataentry()) + + if self.crop_audio_segments: + audio = AudioSegment.from_wav(data_entry['audio_filepath']) + for agg_segment in agg_segments: + get_audio_segment(audio = audio, + start_time = agg_segment.data['start_time'], + end_time = agg_segment.data['end_time'], + output_audio_filepath = agg_segment.data['audio_filepath']) + + 
return agg_segments \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/create_initial_manifest.py b/sdp/processors/datasets/youtube/create_initial_manifest.py new file mode 100644 index 00000000..3bca6ee1 --- /dev/null +++ b/sdp/processors/datasets/youtube/create_initial_manifest.py @@ -0,0 +1,90 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict +from glob import glob + +from sdp.logging import logger +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry +from sdp.processors.datasets.youtube.utils import parse_srt, Sample +from sdp.utils.common import ffmpeg_convert + +class CreateInitialManifest(BaseParallelProcessor): + def __init__( + self, + data_dir: str, + output_audio_dir: str, + audio_file_extenstion: str = ".opus", + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.data_dir = data_dir + self.output_audio_dir = output_audio_dir + self.audio_file_extenstion = audio_file_extenstion + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def _get_manifest(self): + audio_filepaths = glob(f"{self.data_dir}/*{self.audio_file_extenstion}") + samples = [] + for audio_filepath in audio_filepaths: + sample = Sample(orig_audio_filepath = audio_filepath) + sample.sample_id = 
os.path.basename(audio_filepath).replace(self.audio_file_extenstion, "") # Get sample_id + + # Get .srt file, which relaterd to source audio + srt_filepaths = glob(f"{self.data_dir}/{sample.sample_id}.*.srt") + + if len(srt_filepaths) < 1: + logger.warning(f"Sample \"{sample.sample_id}\" has no related .srt files. Skipping") + continue + + srt_filepath = srt_filepaths[0] + if len(srt_filepaths) > 1: + logger.warning(f"Sample \"{sample.sample_id}\" has multiple related .srt files: {', '.join(srt_filepaths)}. \ + Only first file will be used for parsing - {srt_filepaths[0]}, other related .srt files will be skipped.") + + sample.srt_filepath = srt_filepath + samples.append(sample.to_dataentry()) + + return samples + + def prepare(self): + os.makedirs(os.path.join(self.output_audio_dir), exist_ok=True) + + def read_manifest(self): + data_entries = self._get_manifest() + return data_entries + + def process_dataset_entry(self, data_entry: DataEntry): + # Convert source_audio_filepath to .wav + data_entry.data['audio_filepath'] = os.path.join(self.output_audio_dir, os.path.basename(data_entry.data['orig_audio_filepath']).replace(self.audio_file_extenstion, ".wav")) + + ffmpeg_convert(input_file=data_entry.data['orig_audio_filepath'], + output_wav=data_entry.data['audio_filepath'], + sample_rate=self.target_samplerate, + num_channels=self.target_nchannels) + + if not os.path.exists(data_entry.data['audio_filepath']): + return [] + + # Parse segments from .srt + segments = parse_srt(data_entry.data['srt_filepath'], verify_duration = True, wav_filepath=data_entry.data['audio_filepath']) + + if len(segments) > 0: + data_entry.data['segments'] = [segment.__dict__ for segment in segments] + + return [data_entry] \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/merge_manifests.py b/sdp/processors/datasets/youtube/merge_manifests.py new file mode 100644 index 00000000..0860c429 --- /dev/null +++ b/sdp/processors/datasets/youtube/merge_manifests.py @@ -0,0 
+1,35 @@ +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry +import json + +class MergeManifests(BaseParallelProcessor): + def __init__( + self, input_manifest_file2: str, fields_to_merge: dict, key_field: str = "audio_filepath", + **kwargs + ): + super().__init__(**kwargs) + self.input_manifest_file2 = input_manifest_file2 + self.manifest2_dict = {} + self.fields_to_merge = fields_to_merge + self.key_field = key_field + + def prepare(self): + with open(self.input_manifest_file2, 'r') as manifest: + line = manifest.readline() + while line: + whole_sample = json.loads(line) + key_value = whole_sample[self.key_field] + sample = {} + for field_names_dict in self.fields_to_merge: + curr_field_name = list(field_names_dict.keys())[0] + sample[curr_field_name] = whole_sample[curr_field_name] + + self.manifest2_dict[key_value] = sample + line = manifest.readline() + + def process_dataset_entry(self, data_entry: dict): + key_value = data_entry[self.key_field] + for field_names_dict in self.fields_to_merge: + curr_field_name = list(field_names_dict.keys())[0] + new_field_name = field_names_dict[curr_field_name] + data_entry[new_field_name] = self.manifest2_dict[key_value][curr_field_name] + return [DataEntry(data=data_entry)] \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/requirements.txt b/sdp/processors/datasets/youtube/requirements.txt new file mode 100644 index 00000000..6f677747 --- /dev/null +++ b/sdp/processors/datasets/youtube/requirements.txt @@ -0,0 +1,2 @@ +pysrt +webvtt-py \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/utils.py b/sdp/processors/datasets/youtube/utils.py new file mode 100644 index 00000000..9f5c9c5e --- /dev/null +++ b/sdp/processors/datasets/youtube/utils.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pysrt +from pydub import AudioSegment +from dataclasses import dataclass +import re +import os +from sdp.processors.base_processor import DataEntry + + +@dataclass +class RawSegment: + segment_id: int = None + start_time: float = None + end_time: float = None + duration: str = None + duration_match: bool = None + orig_text: str = None + + def to_dataentry(self): + return DataEntry(data = self.__dict__) + + +class AggregatedSegment(RawSegment): + def __init__(self, segment: dict, segment_id: int, sample_id: str, output_audio_dir: str): + super().__init__(**segment.__dict__) + self.segment_id = f"{sample_id}_{str(segment_id).zfill(4)}" + self.audio_filepath = os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None + + def aggregate(self, segment): + self.end_time = segment.end_time + self.duration = self.end_time - self.start_time + self.orig_text = re.sub("\s+", " ", f"{self.orig_text} {segment.orig_text}".strip()) + +@dataclass +class Sample: + sample_id: str = None + srt_filepath: str = None + orig_audio_filepath: str = None + audio_filepath: str = None + segments: list[RawSegment | AggregatedSegment] = None + + def to_dataentry(self): + data = self.__dict__ + data['segments'] = [segment.data.__dict__ for segment in data['segments']] if data['segments'] is not None else [] + return DataEntry(data = data) + + +def get_audio_segment(audio, start_time: float, end_time: 
float, output_audio_filepath: str = None): + start_time = start_time * 1000 + end_time = end_time * 1000 + audio_segment = audio[start_time : end_time] + + if output_audio_filepath: + audio_segment.export(output_audio_filepath, format="wav") + return audio_segment + + +def get_audio_segment_duration(audio, start_time, end_time): + audio_segment = get_audio_segment(audio, start_time, end_time) + return audio_segment.duration_seconds + + +def parse_srt(srt_filepath, verify_duration: bool = True, wav_filepath: str = None): + subs = pysrt.open(srt_filepath) + srt_segments = [] + + if verify_duration and wav_filepath: + audio = AudioSegment.from_wav(wav_filepath) + else: + audio = None + + epsilon = 1e-2 + + for sub in subs: + segment = RawSegment(segment_id = sub.index, + start_time = sub.start.ordinal / 1000, + end_time = sub.end.ordinal / 1000, + orig_text = sub.text_without_tags) + + duration_by_timestemps = segment.end_time - segment.start_time + + if audio: + segment.duration = get_audio_segment_duration(audio, segment.start_time, segment.end_time) + segment.duration_match = abs(segment.duration - duration_by_timestemps) < epsilon + else: + segment.duration = duration_by_timestemps + + srt_segments.append(segment) + + return srt_segments \ No newline at end of file diff --git a/sdp/processors/nemo/asr_inference.py b/sdp/processors/nemo/asr_inference.py index 5af6e254..5c2c1bcb 100644 --- a/sdp/processors/nemo/asr_inference.py +++ b/sdp/processors/nemo/asr_inference.py @@ -14,6 +14,7 @@ import os import subprocess +import shutil from pathlib import Path from sdp.processors.base_processor import BaseProcessor @@ -74,3 +75,34 @@ def process(self): shell=True, check=True, ) + + +class ASRInferenceParallel(BaseProcessor): + def __init__( + self, + pretrained_model: str, + batch_size: int = 32, + devices: int = 2, + **kwargs, + ): + super().__init__(**kwargs) + self.script_path = Path(__file__).parents[1] / "nemo" / "transcribe_speech_parallel.py" + 
self.pretrained_model = pretrained_model + self.batch_size = batch_size + self.devices = devices + self.output_manifest_dir = self.output_manifest_file.replace(".json", "") + + def process(self): + subprocess.run( + f"python {self.script_path} " + f"model={self.pretrained_model} " + f"predict_ds.manifest_filepath={self.input_manifest_file} " + f"output_path={self.output_manifest_dir} " + f"predict_ds.batch_size={self.batch_size} " + f"trainer.devices={self.devices} ", + shell=True, + check=True, + ) + + os.rename(os.path.join(self.output_manifest_dir, "predictions_all.json"), self.output_manifest_file) + shutil.rmtree(self.output_manifest_dir) \ No newline at end of file diff --git a/sdp/processors/nemo/transcribe_speech_parallel.py b/sdp/processors/nemo/transcribe_speech_parallel.py new file mode 100644 index 00000000..c0af8f97 --- /dev/null +++ b/sdp/processors/nemo/transcribe_speech_parallel.py @@ -0,0 +1,208 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +# ASR transcribe/inference with multi-GPU/multi-node support for large datasets +# It supports both tarred and non-tarred datasets +# Arguments +# model: path to a nemo/PTL checkpoint file or name of a pretrained model +# predict_ds: config of the dataset/dataloader +# output_path: path to store the predictions +# return_predictions: whether to return the predictions as output other than writing into the files +# use_cer: whether to calculate the error in terms of CER or use the default WER +# +# Results of each GPU/worker is written into a file named 'predictions_{rank}.json, and aggregated results of all workers are written into 'predictions_all.json' + +Example for non-tarred datasets: + +python transcribe_speech_parallel.py \ + model=stt_en_conformer_ctc_large \ + predict_ds.manifest_filepath=/dataset/manifest_file.json \ + predict_ds.batch_size=16 \ + output_path=/tmp/ + +Example for Hybrid-CTC/RNNT models with non-tarred datasets: + +python transcribe_speech_parallel.py \ + model=stt_en_fastconformer_hybrid_large \ + decoder_type=ctc \ + predict_ds.manifest_filepath=/dataset/manifest_file.json \ + predict_ds.batch_size=16 \ + output_path=/tmp/ + +Example for tarred datasets: + +python transcribe_speech_parallel.py \ + predict_ds.is_tarred=true \ + predict_ds.manifest_filepath=/tarred_dataset/tarred_audio_manifest.json \ + predict_ds.tarred_audio_filepaths=/tarred_dataset/audio__OP_0..127_CL_.tar \ + ... + +By default the trainer uses all the GPUs available and default precision is FP32. +By setting the trainer config you may control these configs. For example to do the predictions with AMP on just two GPUs: + +python transcribe_speech_parallel.py \ + trainer.precision=16 \ + trainer.devices=2 \ + ... + +You may control the dataloader's config by setting the predict_ds: + +python transcribe_speech_parallel.py \ + predict_ds.num_workers=8 \ + predict_ds.min_duration=2.0 \ + predict_ds.sample_rate=16000 \ + model=stt_en_conformer_ctc_small \ + ... 
+ +""" + + +import itertools +import json +import os +from dataclasses import dataclass, is_dataclass +from typing import Optional + +import pytorch_lightning as ptl +import torch +from omegaconf import MISSING, OmegaConf + +from nemo.collections.asr.data.audio_to_text_dataset import ASRPredictionWriter +from nemo.collections.asr.metrics.wer import word_error_rate +from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCModel +from nemo.collections.asr.models.configs.asr_models_config import ASRDatasetConfig +from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig +from nemo.core.config import TrainerConfig, hydra_runner +from nemo.utils import logging +from nemo.utils.get_rank import is_global_rank_zero + + +@dataclass +class ParallelTranscriptionConfig: + model: Optional[str] = None # name + predict_ds: ASRDatasetConfig = ASRDatasetConfig(return_sample_id=True, num_workers=4) + output_path: str = MISSING + + # when return_predictions is enabled, the prediction call would keep all the predictions in memory and return them when prediction is done + return_predictions: bool = False + use_cer: bool = False + + # decoding strategy for RNNT models + rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig() + + # decoder type: ctc or rnnt, can be used to switch between CTC and RNNT decoder for Hybrid RNNT/CTC models + decoder_type: Optional[str] = None + # att_context_size can be set for cache-aware streaming models with multiple look-aheads + att_context_size: Optional[list] = None + + trainer: TrainerConfig = TrainerConfig(devices=-1, accelerator="gpu", strategy="ddp") + + +def match_train_config(predict_ds, train_ds): + # It copies the important configurations from the train dataset of the model + # into the predict_ds to be used for prediction. It is needed to match the training configurations. 
+ if train_ds is None: + return + + predict_ds.sample_rate = train_ds.get("sample_rate", 16000) + cfg_name_list = [ + "int_values", + "use_start_end_token", + "blank_index", + "unk_index", + "normalize", + "parser", + "eos_id", + "bos_id", + "pad_id", + ] + + if is_dataclass(predict_ds): + predict_ds = OmegaConf.structured(predict_ds) + for cfg_name in cfg_name_list: + if hasattr(train_ds, cfg_name): + setattr(predict_ds, cfg_name, getattr(train_ds, cfg_name)) + + return predict_ds + + +@hydra_runner(config_name="TranscriptionConfig", schema=ParallelTranscriptionConfig) +def main(cfg: ParallelTranscriptionConfig): + if cfg.model.endswith(".nemo"): + logging.info("Attempting to initialize from .nemo file") + model = ASRModel.restore_from(restore_path=cfg.model, map_location="cpu") + elif cfg.model.endswith(".ckpt"): + logging.info("Attempting to initialize from .ckpt file") + model = ASRModel.load_from_checkpoint(checkpoint_path=cfg.model, map_location="cpu") + else: + logging.info( + "Attempting to initialize from a pretrained model as the model name does not have the extension of .nemo or .ckpt" + ) + model = ASRModel.from_pretrained(model_name=cfg.model, map_location="cpu") + + if isinstance(model, EncDecHybridRNNTCTCModel) and cfg.decoder_type is not None: + model.change_decoding_strategy(decoder_type=cfg.decoder_type) + + trainer = ptl.Trainer(**cfg.trainer) + + cfg.predict_ds.return_sample_id = True + cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model.cfg.train_ds) + data_loader = model._setup_dataloader_from_config(cfg.predict_ds) + + os.makedirs(cfg.output_path, exist_ok=True) + # trainer.global_rank is not valid before predict() is called. Need this hack to find the correct global_rank. 
+ global_rank = trainer.node_rank * trainer.num_devices + int(os.environ.get("LOCAL_RANK", 0)) + output_file = os.path.join(cfg.output_path, f"predictions_{global_rank}.json") + predictor_writer = ASRPredictionWriter(dataset=data_loader.dataset, output_file=output_file) + trainer.callbacks.extend([predictor_writer]) + + predictions = trainer.predict(model=model, dataloaders=data_loader, return_predictions=cfg.return_predictions) + if predictions is not None: + predictions = list(itertools.chain.from_iterable(predictions)) + samples_num = predictor_writer.close_output_file() + + logging.info( + f"Prediction on rank {global_rank} is done for {samples_num} samples and results are stored in {output_file}." + ) + + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + samples_num = 0 + pred_text_list = [] + text_list = [] + if is_global_rank_zero(): + output_file = os.path.join(cfg.output_path, f"predictions_all.json") + logging.info(f"Prediction files are being aggregated in {output_file}.") + with open(output_file, 'w') as outf: + for rank in range(trainer.world_size): + input_file = os.path.join(cfg.output_path, f"predictions_{rank}.json") + with open(input_file, 'r') as inpf: + lines = inpf.readlines() + for line in lines: + item = json.loads(line) + pred_text_list.append(item["pred_text"]) + text_list.append(item["text"]) + outf.write(json.dumps(item) + "\n") + samples_num += 1 + wer_cer = word_error_rate(hypotheses=pred_text_list, references=text_list, use_cer=cfg.use_cer) + logging.info( + f"Prediction is done for {samples_num} samples in total on all workers and results are aggregated in {output_file}." 
+ ) + logging.info("{} for all predictions is {:.4f}.".format("CER" if cfg.use_cer else "WER", wer_cer)) + + +if __name__ == '__main__': + main() From f862b2a1c6e61b2fafd498e3983ac069edcb8ed5 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 19 Mar 2024 12:55:21 -0700 Subject: [PATCH 095/115] black Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/__init__.py | 31 +++++++++++--- .../datasets/commoncrawl/commoncrawl.py | 38 +++++++++++++++++ .../datasets/commoncrawl/harv_utils.py | 41 ++++++++----------- 3 files changed, 80 insertions(+), 30 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index b4fe3020..6a0de649 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -12,8 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ - Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ - ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ - SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ - TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles, ManifestToUtf8 +from .commoncrawl import ( + ASR_HF, + AlignerSubprocess, + AllVttText, + AudioLid, + BLEUScore, + CopyFiles, + CreateInitialManifestCC, + CreateInitialManifestExt, + DropAbsPath, + EvalBandwidth, + GetSpecificFiles, + JoinBy, + Lang2Iso, + ManifestToUtf8, + NmtSubprocess, + ReadParquet, + SplitByAligner, + SplitByVtt, + SplitByVttSentence, + Subprocess, + TextLid, + TrainDevTestSplitCC, + TxtToVtt, + UseSonar, +) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 77a9ddaa..7cc15e4a 100644 --- 
a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -27,6 +27,7 @@ load_manifest, make_trans_list, read_jsonl, + split_by_vtt, split_by_vtt_new, text2lid, txt2vtt, @@ -1110,6 +1111,43 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] +class SplitByVtt(BaseParallelProcessor): + def __init__( + self, + source_audio_key: str, + caption_file_key: str, + duration_key: str = "duration", + output_text_key: str = "orig_text", + **kwargs, + ): + super().__init__(**kwargs) + self.source_audio_key = source_audio_key + self.duration_key = duration_key + self.output_text_key = output_text_key + self.caption_file_key = caption_file_key + + def prepare(self): + os.makedirs(self.splited_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + caption_file = data_entry[self.caption_file_key] + source_audio = data_entry[self.source_audio_key] + res_list = [] + + if os.path.isfile(source_audio): + data, samplerate = sf.read(source_audio) + text_list, start_s, end_s = split_by_vtt(caption_file, samplerate) + if text_list: + for segment_id, orig_text, start_time, end_time in enumerate(zip(text_list, start_s, end_s)): + data_entry["segment_id"] = segment_id + data_entry[self.output_text_key] = orig_text + data_entry["start_time"] = start_time + data_entry["end_time"] = end_time + + # self.makeDataEntry(data_entry, data, caption_file, samplerate, text, start_sr, end_sr) + return res_list + + class SplitByVttSentence(BaseParallelProcessor): """ A class for splitting audio files based on VTT (WebVTT) sentence-level segmentation in a dataset. 
diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py index 9c9ae837..1b5767da 100644 --- a/sdp/processors/datasets/commoncrawl/harv_utils.py +++ b/sdp/processors/datasets/commoncrawl/harv_utils.py @@ -97,16 +97,20 @@ def parse_hours(inp): return datetime.strptime(inp, '%H:%M:%S.%f') -def split_by_vtt(vtt_file, wav_file, wav_save_path): +def split_by_vtt(vtt_file, samplerate): try: - data, samplerate = sf.read(wav_file) - target_sr = samplerate - if len(data.shape) > 1: - data = np.mean(data, axis=1) _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') - rel_vtt_file = '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]) - wav_list, text_list, dur_list = [], [], [] - for caption in webvtt.read(vtt_file): + text_list, start_s, end_s = [], [], [] + if os.path.splitext(vtt_file)[1] == '.vtt': + webvtt_i = webvtt.read + elif os.path.splitext(vtt_file)[1] == '.srt': + webvtt_i = webvtt.from_srt + else: + raise ValueError("Unsupporte extention of file " + vtt_file) + + for caption in webvtt_i(vtt_file): + text = ' '.join(caption.text.split('\n')) + _start = parse_hours(caption.start) start = (_start - _begin).total_seconds() start_sr = int(start * samplerate) @@ -115,23 +119,10 @@ def split_by_vtt(vtt_file, wav_file, wav_save_path): end = (_end - _begin).total_seconds() end_sr = int(end * samplerate) - text = ' '.join(caption.text.split('\n')) - - wav_save_file = os.path.join( - wav_save_path, rel_vtt_file, str(int(start * 1000)) + "-" + str(int(end * 1000)) + ".wav" - ) - os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) - - # number_of_samples = round(len(data[start_sr:end_sr]) * float(target_sr) / samplerate) - # if number_of_samples > 0: - # if not os.path.exists(wav_save_file): - # data_sample = sps.resample(data[start_sr:end_sr], number_of_samples) - data_sample = data[start_sr:end_sr] - sf.write(wav_save_file, data_sample, target_sr) - text_list.append(text) - 
wav_list.append(wav_save_file) - dur_list.append(data_sample.shape[0] / samplerate) # (_end-_start).total_seconds() - return wav_list, text_list, dur_list + text_list.append(text.strip()) + start_s.append(start) + end_s.append(end) + return text_list, start_s, end_s except Exception as e: logger.warning(str(e) + vtt_file) return None, None, None From e2fe178bb49fccea4c788b759a768a566bc31232 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 19 Mar 2024 22:40:12 -0700 Subject: [PATCH 096/115] black Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/__init__.py | 1 - .../datasets/commoncrawl/commoncrawl.py | 69 +++---------------- .../datasets/commoncrawl/harv_utils.py | 4 +- .../datasets/youtube/aggregate_segments.py | 61 ++++++++++------ sdp/processors/datasets/youtube/utils.py | 54 +++++++++------ 5 files changed, 81 insertions(+), 108 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 6a0de649..815a5549 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -20,7 +20,6 @@ BLEUScore, CopyFiles, CreateInitialManifestCC, - CreateInitialManifestExt, DropAbsPath, EvalBandwidth, GetSpecificFiles, diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index c0c4a749..35b0385c 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -33,6 +33,7 @@ txt2vtt, write_jsonl, ) +from sdp.processors.datasets.youtube.utils import Sample, parse_srt class ManifestToUtf8(BaseProcessor): @@ -1126,26 +1127,16 @@ def __init__( self.output_text_key = output_text_key self.caption_file_key = caption_file_key - def prepare(self): - os.makedirs(self.splited_audio_dir, exist_ok=True) - def process_dataset_entry(self, data_entry): caption_file = data_entry[self.caption_file_key] - source_audio = 
data_entry[self.source_audio_key] - res_list = [] - - if os.path.isfile(source_audio): - data, samplerate = sf.read(source_audio) - text_list, start_s, end_s = split_by_vtt(caption_file, samplerate) - if text_list: - for segment_id, orig_text, start_time, end_time in enumerate(zip(text_list, start_s, end_s)): - data_entry["segment_id"] = segment_id - data_entry[self.output_text_key] = orig_text - data_entry["start_time"] = start_time - data_entry["end_time"] = end_time + audio_file = data_entry[self.source_audio_key] + if not os.path.exists(audio_file): + return [] + segments = parse_srt(caption_file, verify_duration=True, wav_filepath=audio_file) - # self.makeDataEntry(data_entry, data, caption_file, samplerate, text, start_sr, end_sr) - return res_list + if len(segments) > 0: + data_entry['segments'] = [segment.__dict__ for segment in segments] + return [DataEntry(data=data_entry)] class SplitByVttSentence(BaseParallelProcessor): @@ -1575,47 +1566,3 @@ def process_dataset_entry(self, data_entry): data = {self.video_key: video, self.id_key: key, self.text_key: text} return [DataEntry(data=data)] - - -class CreateInitialManifestExt(BaseParallelProcessor): - """ - A class for creating an initial dataset manifest from audio files with a specified extension. - - Args: - raw_data_dir (str): The directory containing audio files to include in the initial dataset manifest. - output_field (str, optional): The field to store the audio file paths in the dataset. Defaults to "audio_filepath". - extention (str, optional): The file extension of the audio files to include in the manifest. Defaults to "mp3". - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - - Methods: - prepare(): Creates the directory for saving the initial dataset manifest. - read_manifest(): Reads the audio files with the specified extension and creates a DataFrame with the specified output field. 
- process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. - - """ - - def __init__( - self, - raw_data_dir: str, - output_field: str = "audio_filepath", - extention: str = "mp3", - **kwargs, - ): - super().__init__(**kwargs) - self.raw_data_dir = Path(raw_data_dir) - self.output_field = output_field - self.extention = extention - - def prepare(self): - os.makedirs(self.raw_data_dir, exist_ok=True) - - def read_manifest(self): - input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] - v_df = pd.DataFrame({self.output_field: input_files}) - return v_df.values - - def process_dataset_entry(self, data_entry): - (inputf) = data_entry - - data = {self.output_field: inputf[0]} - return [DataEntry(data=data)] diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py index 1b5767da..41d591b0 100644 --- a/sdp/processors/datasets/commoncrawl/harv_utils.py +++ b/sdp/processors/datasets/commoncrawl/harv_utils.py @@ -97,7 +97,7 @@ def parse_hours(inp): return datetime.strptime(inp, '%H:%M:%S.%f') -def split_by_vtt(vtt_file, samplerate): +def split_by_vtt(vtt_file): try: _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') text_list, start_s, end_s = [], [], [] @@ -113,11 +113,9 @@ def split_by_vtt(vtt_file, samplerate): _start = parse_hours(caption.start) start = (_start - _begin).total_seconds() - start_sr = int(start * samplerate) _end = parse_hours(caption.end) end = (_end - _begin).total_seconds() - end_sr = int(end * samplerate) text_list.append(text.strip()) start_s.append(start) diff --git a/sdp/processors/datasets/youtube/aggregate_segments.py b/sdp/processors/datasets/youtube/aggregate_segments.py index d97524c4..c364ad94 100644 --- a/sdp/processors/datasets/youtube/aggregate_segments.py +++ b/sdp/processors/datasets/youtube/aggregate_segments.py @@ -9,16 +9,23 @@ # 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pydub import AudioSegment import os +from pydub import AudioSegment + from sdp.processors.base_processor import BaseParallelProcessor -from sdp.processors.datasets.youtube.utils import RawSegment, AggregatedSegment, get_audio_segment +from sdp.processors.datasets.youtube.utils import ( + AggregatedSegment, + RawSegment, + get_audio_segment, +) class AggregateSegments(BaseParallelProcessor): def __init__( self, + source_audio_key: str = "audio_filepath", + splited_audio_key: str = "audio_filepath", max_duration: float = 40.0, crop_audio_segments: bool = True, output_segments_audio_dir: str = None, @@ -26,13 +33,15 @@ def __init__( ): super().__init__(**kwargs) self.max_duration = max_duration + self.source_audio_key = source_audio_key + self.splited_audio_key = splited_audio_key self.crop_audio_segments = crop_audio_segments self.output_segments_audio_dir = output_segments_audio_dir - + def prepare(self): if self.crop_audio_segments and self.output_segments_audio_dir: os.makedirs(os.path.join(self.output_segments_audio_dir), exist_ok=True) - + def process_dataset_entry(self, data_entry: dict): sample_id = data_entry['sample_id'] segments = data_entry['segments'] @@ -42,30 +51,38 @@ def process_dataset_entry(self, data_entry: dict): return agg_segments first_segment = RawSegment(**segments[0]) - agg_segment = AggregatedSegment(segment=first_segment, segment_id=1, sample_id=sample_id, - output_audio_dir = self.output_segments_audio_dir) + agg_segment = AggregatedSegment( + segment=first_segment, segment_id=1, sample_id=sample_id, output_audio_dir=self.output_segments_audio_dir + ) - for segment in segments[1 : ]: + for segment in segments[1:]: segment = RawSegment(**segment) - - if (not agg_segment.duration_match or - agg_segment.duration >= self.max_duration or - segment.end_time - 
agg_segment.start_time >= self.max_duration): + + if ( + not agg_segment.duration_match + or agg_segment.duration >= self.max_duration + or segment.end_time - agg_segment.start_time >= self.max_duration + ): agg_segments.append(agg_segment.to_dataentry()) - agg_segment = AggregatedSegment(segment=segment, - segment_id=len(agg_segments) + 1, sample_id=sample_id, - output_audio_dir = self.output_segments_audio_dir) + agg_segment = AggregatedSegment( + segment=segment, + segment_id=len(agg_segments) + 1, + sample_id=sample_id, + output_audio_dir=self.output_segments_audio_dir, + ) else: agg_segment.aggregate(segment) else: agg_segments.append(agg_segment.to_dataentry()) - + if self.crop_audio_segments: - audio = AudioSegment.from_wav(data_entry['audio_filepath']) + audio = AudioSegment.from_wav(data_entry[self.source_audio_key]) for agg_segment in agg_segments: - get_audio_segment(audio = audio, - start_time = agg_segment.data['start_time'], - end_time = agg_segment.data['end_time'], - output_audio_filepath = agg_segment.data['audio_filepath']) - - return agg_segments \ No newline at end of file + get_audio_segment( + audio=audio, + start_time=agg_segment.data['start_time'], + end_time=agg_segment.data['end_time'], + output_audio_filepath=agg_segment.data[self.splited_audio_key], + ) + + return agg_segments diff --git a/sdp/processors/datasets/youtube/utils.py b/sdp/processors/datasets/youtube/utils.py index 9f5c9c5e..ec179f73 100644 --- a/sdp/processors/datasets/youtube/utils.py +++ b/sdp/processors/datasets/youtube/utils.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os +import re +from dataclasses import dataclass + import pysrt from pydub import AudioSegment -from dataclasses import dataclass -import re -import os + from sdp.processors.base_processor import DataEntry @@ -28,22 +30,28 @@ class RawSegment: duration: str = None duration_match: bool = None orig_text: str = None + audio_lang: str = None + text_lang: str = None + source_audio: str = None def to_dataentry(self): - return DataEntry(data = self.__dict__) + return DataEntry(data=self.__dict__) class AggregatedSegment(RawSegment): def __init__(self, segment: dict, segment_id: int, sample_id: str, output_audio_dir: str): super().__init__(**segment.__dict__) self.segment_id = f"{sample_id}_{str(segment_id).zfill(4)}" - self.audio_filepath = os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None - + self.audio_filepath = ( + os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None + ) + def aggregate(self, segment): self.end_time = segment.end_time self.duration = self.end_time - self.start_time self.orig_text = re.sub("\s+", " ", f"{self.orig_text} {segment.orig_text}".strip()) + @dataclass class Sample: sample_id: str = None @@ -53,16 +61,18 @@ class Sample: segments: list[RawSegment | AggregatedSegment] = None def to_dataentry(self): - data = self.__dict__ - data['segments'] = [segment.data.__dict__ for segment in data['segments']] if data['segments'] is not None else [] - return DataEntry(data = data) - + data = self.__dict__ + data['segments'] = ( + [segment.data.__dict__ for segment in data['segments']] if data['segments'] is not None else [] + ) + return DataEntry(data=data) + def get_audio_segment(audio, start_time: float, end_time: float, output_audio_filepath: str = None): start_time = start_time * 1000 end_time = end_time * 1000 - audio_segment = audio[start_time : end_time] - + audio_segment = audio[start_time:end_time] + if output_audio_filepath: 
audio_segment.export(output_audio_filepath, format="wav") return audio_segment @@ -85,19 +95,21 @@ def parse_srt(srt_filepath, verify_duration: bool = True, wav_filepath: str = No epsilon = 1e-2 for sub in subs: - segment = RawSegment(segment_id = sub.index, - start_time = sub.start.ordinal / 1000, - end_time = sub.end.ordinal / 1000, - orig_text = sub.text_without_tags) - + segment = RawSegment( + segment_id=sub.index, + start_time=sub.start.ordinal / 1000, + end_time=sub.end.ordinal / 1000, + orig_text=sub.text_without_tags, + ) + duration_by_timestemps = segment.end_time - segment.start_time if audio: segment.duration = get_audio_segment_duration(audio, segment.start_time, segment.end_time) - segment.duration_match = abs(segment.duration - duration_by_timestemps) < epsilon - else: + segment.duration_match = abs(segment.duration - duration_by_timestemps) < epsilon + else: segment.duration = duration_by_timestemps srt_segments.append(segment) - - return srt_segments \ No newline at end of file + + return srt_segments From 8f99da09ce696b14cfb6a3e104e85eea3dc42a3c Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 20 Mar 2024 00:44:34 -0700 Subject: [PATCH 097/115] proxy Signed-off-by: Nikolay Karpov --- .../datasets/youtube/aggregate_segments.py | 11 ++++++++++- sdp/processors/datasets/youtube/utils.py | 14 +++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sdp/processors/datasets/youtube/aggregate_segments.py b/sdp/processors/datasets/youtube/aggregate_segments.py index c364ad94..64927091 100644 --- a/sdp/processors/datasets/youtube/aggregate_segments.py +++ b/sdp/processors/datasets/youtube/aggregate_segments.py @@ -52,7 +52,13 @@ def process_dataset_entry(self, data_entry: dict): first_segment = RawSegment(**segments[0]) agg_segment = AggregatedSegment( - segment=first_segment, segment_id=1, sample_id=sample_id, output_audio_dir=self.output_segments_audio_dir + segment=first_segment, + segment_id=1, + sample_id=sample_id, + 
output_audio_dir=self.output_segments_audio_dir, + audio_lang=data_entry['audio_lang'], + text_lang=data_entry['text_lang'], + source_audio=data_entry[self.source_audio_key], ) for segment in segments[1:]: @@ -68,6 +74,9 @@ def process_dataset_entry(self, data_entry: dict): segment=segment, segment_id=len(agg_segments) + 1, sample_id=sample_id, + audio_lang=data_entry['audio_lang'], + text_lang=data_entry['text_lang'], + source_audio=data_entry[self.source_audio_key], output_audio_dir=self.output_segments_audio_dir, ) else: diff --git a/sdp/processors/datasets/youtube/utils.py b/sdp/processors/datasets/youtube/utils.py index ec179f73..48483221 100644 --- a/sdp/processors/datasets/youtube/utils.py +++ b/sdp/processors/datasets/youtube/utils.py @@ -39,9 +39,21 @@ def to_dataentry(self): class AggregatedSegment(RawSegment): - def __init__(self, segment: dict, segment_id: int, sample_id: str, output_audio_dir: str): + def __init__( + self, + segment: dict, + segment_id: int, + sample_id: str, + output_audio_dir: str, + audio_lang: str, + text_lang: str, + source_audio: str, + ): super().__init__(**segment.__dict__) self.segment_id = f"{sample_id}_{str(segment_id).zfill(4)}" + self.audio_lang = audio_lang + self.text_lang = text_lang + self.source_audio = source_audio self.audio_filepath = ( os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None ) From df15c3314c8f2eb7425e25e1a59a9cc59cef8176 Mon Sep 17 00:00:00 2001 From: Sasha Meister <117230141+ssh-meister@users.noreply.github.com> Date: Thu, 21 Mar 2024 06:30:11 +0100 Subject: [PATCH 098/115] New processors for calculating metrics (#50) * YouTube German config and new processors Signed-off-by: Sasha Meister * Added Merge Manifests processor Signed-off-by: Sasha Meister * Clean de.yaml pipeline config Signed-off-by: Sasha Meister * Fix Lang2Iso Signed-off-by: Sasha Meister * fix typo * fix empty list error - IndexError: list index out of range * Added requirements.txt 
Signed-off-by: Sasha Meister * Fixed paths for audio TN Signed-off-by: Sasha Meister * Updated requirements.txt Signed-off-by: Sasha Meister * ew processors for calculating metrics WER, CER, eedge CER, len diff ratio Signed-off-by: Sasha Meister * Update utils.py * Update aggregate_segments.py * Update aggregate_segments.py * Update aggregate_segments.py --------- Signed-off-by: Sasha Meister --- sdp/processors/__init__.py | 4 + .../datasets/commoncrawl/__init__.py | 2 +- .../modify_manifest/data_to_data.py | 356 ++++++++++++++++++ 3 files changed, 361 insertions(+), 1 deletion(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 2ab441c5..58cae45b 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -54,6 +54,10 @@ SubIfASRSubstitution, SubMakeLowercase, SubRegex, + GetWER, + GetCER, + GetEdgeCER, + GetLenDiffRatio, ) from sdp.processors.modify_manifest.data_to_dropbool import ( DropASRError, diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 815a5549..e20ef3b2 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -36,4 +36,4 @@ TrainDevTestSplitCC, TxtToVtt, UseSonar, -) +) \ No newline at end of file diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index dd09f8dc..762dc37f 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -16,6 +16,12 @@ import os import re from typing import Dict, List +import jiwer +import editdistance +import itertools +from tqdm.contrib.concurrent import process_map +from tqdm import tqdm +import json import soundfile as sf @@ -525,3 +531,353 @@ def finalize(self, metrics): for word, count in total_counter_sorted.items(): logger.info(f"{word} {count}") super().finalize(metrics) + +class GetWER(BaseParallelProcessor): + """ + Processor that 
computes the Word Error Rate (WER) between reference text and hypothesis text. + The WER is computed as the Levenshtein distance between the two texts normalized by the + number of words in the reference text. + + Args: + reference_text_field (str): Key to get the reference text from the data. + hypothesis_text_field (str): Key to get the hypothesis text from the data. + output_metric_field (str): Key to put the computed WER value. + + Returns: + All the same fields as in the input manifest plus the output_metric_field containing + the computed WER value. + """ + + def __init__( + self, + reference_text_field: str = "text", + hypothesis_text_field: str = "pred_text", + output_metric_field: str = "wer", + **kwargs, + ): + super().__init__(**kwargs) + self.reference_text_field = reference_text_field + self.hypothesis_text_field = hypothesis_text_field + self.output_metric_field = output_metric_field + self.word_dist = 0 + self.num_words = 0 + + def process(self): + self.prepare() + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + # this will unroll all inner lists + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + metrics.append(data_entry.metrics) + if data_entry.data is None: + continue + json.dump(data_entry.data, fout, ensure_ascii=False) + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + self.word_dist += data_entry.metrics.get("word_dist", 0) + self.num_words += data_entry.metrics.get("num_words", 0) + fout.write("\n") + + self.finalize(metrics) + + def process_dataset_entry(self, data_entry): + reference_text = data_entry[self.reference_text_field] + hypothesis_text = data_entry[self.hypothesis_text_field] + + 
ref_words_amount = len(reference_text.split()) + hyp_words_amount = len(hypothesis_text.split()) + + if ref_words_amount == 0 or hyp_words_amount == 0: + if ref_words_amount == hyp_words_amount: + word_dist = 0 + else: + word_dist = ref_words_amount + else: + word_dist_measures = jiwer.compute_measures(reference_text, hypothesis_text) + word_dist = word_dist_measures['substitutions'] + word_dist_measures['insertions'] + word_dist_measures['deletions'] + + wer_value = word_dist / ref_words_amount + data_entry[self.output_metric_field] = round(wer_value * 100, 2) + + return [DataEntry(data=data_entry, metrics = {'word_dist' : word_dist, 'num_words' : ref_words_amount})] + + def finalize(self, metrics: List): + logger.info("Total number of entries after processing: %d", self.number_of_entries) + if self.total_duration != 0: + logger.info("Total audio duration (hours) after processing: %.2f", self.total_duration / 3600) + + logger.info("Overall Word Error Rate (WER): %.2f%%", self.word_dist / self.num_words * 100) + + +class GetCER(BaseParallelProcessor): + """ + Processor that computes the Character Error Rate (CER) between reference text and hypothesis text. + The CER is computed as the Levenshtein distance between the two texts normalized by the + number of characters in the reference text. + + Args: + reference_text_field (str): Key to get the reference text from the data. + hypothesis_text_field (str): Key to get the hypothesis text from the data. + output_metric_field (str): Key to put the computed CER value. + + Returns: + All the same fields as in the input manifest plus the output_metric_field containing + the computed CER value. 
+ """ + + def __init__( + self, + reference_text_field: str = "text", + hypothesis_text_field: str = "pred_text", + output_metric_field: str = "cer", + **kwargs, + ): + super().__init__(**kwargs) + self.reference_text_field = reference_text_field + self.hypothesis_text_field = hypothesis_text_field + self.output_metric_field = output_metric_field + self.char_dist = 0 + self.num_chars = 0 + + def process(self): + self.prepare() + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + # this will unroll all inner lists + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + metrics.append(data_entry.metrics) + if data_entry.data is None: + continue + json.dump(data_entry.data, fout, ensure_ascii=False) + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + self.char_dist += data_entry.metrics.get("char_dist", 0) + self.num_chars += data_entry.metrics.get("num_chars", 0) + fout.write("\n") + + self.finalize(metrics) + + def process_dataset_entry(self, data_entry): + reference_text = data_entry[self.reference_text_field] + hypothesis_text = data_entry[self.hypothesis_text_field] + + ref_chars_amount = len(reference_text) + hyp_chars_amount = len(hypothesis_text) + + if ref_chars_amount == 0 or hyp_chars_amount == 0: + if ref_chars_amount == hyp_chars_amount: + char_dist = 0 + else: + char_dist = ref_chars_amount + else: + char_dist = editdistance.eval(reference_text, hypothesis_text) + + cer_value = char_dist / ref_chars_amount + data_entry[self.output_metric_field] = round(cer_value * 100, 2) + + return [DataEntry(data=data_entry, metrics = {'char_dist' : char_dist, 'num_chars' : ref_chars_amount})] + + def finalize(self, metrics: List): + 
logger.info("Total number of entries after processing: %d", self.number_of_entries) + if self.total_duration != 0: + logger.info("Total audio duration (hours) after processing: %.2f", self.total_duration / 3600) + + logger.info("Overall Character Error Rate (CER): %.2f%%", self.char_dist / self.num_chars * 100) + + +class GetEdgeCER(BaseParallelProcessor): + """ + Processor that computes the Character Error Rate (CER) for a specified edge of reference + and hypothesis texts. + + Args: + reference_text_field (str): Key to get the reference text from the data. + hypothesis_text_field (str): Key to get the hypothesis text from the data. + edge (str): Specifies whether to compute CER for the 'start' or 'end' edge of the texts. + edge_len (int): Length of the edge window. + output_metric_field (str): Key to put the computed edge CER value. + + Returns: + All the same fields as in the input manifest plus the output_metric_field containing + the computed edge CER value. + """ + + def __init__( + self, + reference_text_field: str = "text", + hypothesis_text_field: str = "pred_text", + edge: str = "start", + edge_len: int = 10, + output_metric_field: str = "start_cer", + **kwargs, + ): + super().__init__(**kwargs) + self.reference_text_field = reference_text_field + self.hypothesis_text_field = hypothesis_text_field + self.edge = edge + self.edge_len = edge_len + self.output_metric_field = output_metric_field + self.edge_cer_sum = 0 + + def process(self): + self.prepare() + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + # this will unroll all inner lists + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + metrics.append(data_entry.metrics) + if data_entry.data is None: + 
continue + json.dump(data_entry.data, fout, ensure_ascii=False) + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + self.edge_cer_sum += data_entry.data.get(self.output_metric_field, 0) + fout.write("\n") + + self.finalize(metrics) + + def process_dataset_entry(self, data_entry): + if self.edge == "start": + start_idx = 0 + end_idx = self.edge_len + elif self.edge == "end": + start_idx = -self.edge_len + end_idx = -1 + else: + raise ValueError(f"Current `Edge` parameter value ({self.edge}) is incorrect. Please select `start` or `end` edge.") + + reference_text_edge = data_entry[self.reference_text_field][start_idx : end_idx] + hypothesis_text_edge = data_entry[self.hypothesis_text_field][start_idx : end_idx] + + ref_chars_amount = len(reference_text_edge) + hyp_chars_amount = len(hypothesis_text_edge) + + if ref_chars_amount == 0 or hyp_chars_amount == 0: + if ref_chars_amount == hyp_chars_amount: + char_dist = 0 + else: + char_dist = ref_chars_amount + else: + char_dist = editdistance.eval(reference_text_edge, hypothesis_text_edge) + + edge_cer_value = char_dist / ref_chars_amount + data_entry[self.output_metric_field] = round(edge_cer_value * 100, 2) + + return [DataEntry(data=data_entry)] + + def finalize(self, metrics: List): + logger.info("Total number of entries after processing: %d", self.number_of_entries) + if self.total_duration != 0: + logger.info("Total audio duration (hours) after processing: %.2f", self.total_duration / 3600) + + logger.info(f"Mean {self.edge} Character Error Rate (CER): {round(self.edge_cer_sum / self.number_of_entries, 2)}%") + + +class GetLenDiffRatio(BaseParallelProcessor): + """ + Processor that computes the length difference ratio between reference and hypothesis texts. + + Args: + reference_text_field (str): Key to get the reference text from the data. + hypothesis_text_field (str): Key to get the hypothesis text from the data. 
+ output_metric_field (str): Key to put the computed length difference ratio. + + Returns: + All the same fields as in the input manifest plus the output_metric_field containing + the computed length difference ratio. + """ + + def __init__( + self, + reference_text_field: str = "text", + hypothesis_text_field: str = "pred_text", + output_metric_field: str = "len_diff_ratio", + **kwargs, + ): + super().__init__(**kwargs) + self.reference_text_field = reference_text_field + self.hypothesis_text_field = hypothesis_text_field + self.output_metric_field = output_metric_field + self.words_len_diff_ratio_sum = 0 + + def process(self): + self.prepare() + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + # this will unroll all inner lists + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + metrics.append(data_entry.metrics) + if data_entry.data is None: + continue + json.dump(data_entry.data, fout, ensure_ascii=False) + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + self.words_len_diff_ratio_sum += data_entry.data.get(self.output_metric_field, 0) + fout.write("\n") + + self.finalize(metrics) + + def process_dataset_entry(self, data_entry): + reference_text = data_entry[self.reference_text_field] + hypothesis_text = data_entry[self.hypothesis_text_field] + + ref_words_amount = len(reference_text.split()) + hyp_words_amount = len(hypothesis_text.split()) + + eps = 1e-9 + len_diff_ratio = 1.0 * abs(ref_words_amount - hyp_words_amount) / max(ref_words_amount, eps) + + data_entry[self.output_metric_field] = round(len_diff_ratio * 100, 2) + + return [DataEntry(data=data_entry)] + + def finalize(self, metrics: List): + logger.info("Total number 
of entries after processing: %d", self.number_of_entries) + if self.total_duration != 0: + logger.info("Total audio duration (hours) after processing: %.2f", self.total_duration / 3600) + + logger.info(f"Mean Text Length Difference Ratio (in words): {round(self.words_len_diff_ratio_sum / self.number_of_entries, 2)}%") \ No newline at end of file From 3434b7caa46b7a2184ae626ae46de754b6f69a98 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 9 May 2024 09:45:16 -0700 Subject: [PATCH 099/115] beamsearch Signed-off-by: Nikolay Karpov --- sdp/processors/nemo/beamsearch_inference.py | 276 ++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 sdp/processors/nemo/beamsearch_inference.py diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py new file mode 100644 index 00000000..b183d11a --- /dev/null +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -0,0 +1,276 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import contextlib +import Levenshtein +import json +import os +import re +from dataclasses import dataclass, field, is_dataclass +from pathlib import Path +from typing import Dict, List, Optional, Union + +import editdistance +import numpy as np +import torch +from omegaconf import MISSING, OmegaConf +from sklearn.model_selection import ParameterGrid +from tqdm.auto import tqdm + +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.models import EncDecHybridRNNTCTCModel +from nemo.collections.asr.parts.submodules import ctc_beam_decoding +from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig +from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization, TextProcessingConfig +from nemo.core.config import hydra_runner +from nemo.utils import logging + +from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry + + +def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: + result = [] + with manifest.open() as f: + for i, line in enumerate(f): + data = json.loads(line) + result.append(data) + return result + +@dataclass +class EvalBeamSearchNGramConfig: + """ + Evaluate an ASR model with beam search decoding and n-gram KenLM language model. + """ + # # The path of the '.nemo' file of the ASR model or the name of a pretrained model (ngc / huggingface) + model_path: str = MISSING + + # File paths + dataset_manifest: str = MISSING # The manifest file of the evaluation set + preds_output_folder: Optional[str] = None # The optional folder where the predictions are stored + cache_file: Optional[str] = None # The cache file for storing the logprobs of the model + + # Parameters for inference + batch_size: int = 16 # The batch size to calculate log probabilities + beam_batch_size: int = 1 # The batch size to be used for beam search decoding + + # Set `cuda` to int to define CUDA device. 
If 'None', will look for CUDA + # device anyway, and do inference on CPU only if CUDA device is not found. + # If `cuda` is a negative number, inference will be on CPU only. + cuda: Optional[int] = None + allow_mps: bool = False # allow to select MPS device (Apple Silicon M-series GPU) + amp: bool = False + matmul_precision: str = "highest" # Literal["highest", "high", "medium"] + + # Beam Search hyperparameters + ctc_decoding: CTCDecodingConfig = field(default_factory=lambda: CTCDecodingConfig( + strategy="flashlight", # gready, beam = pyctcdecode, flashlight + beam = ctc_beam_decoding.BeamCTCInferConfig( + nemo_kenlm_path="/mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/mls_test_pc_lm.kenlm", + beam_size=4, + beam_alpha=0.5, # LM weight + beam_beta=0.5, # length weight + return_best_hypothesis = False, + flashlight_cfg=ctc_beam_decoding.FlashlightConfig( + lexicon_path = "/mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/mls_test_pc_lm.flashlight_lexicon"), + pyctcdecode_cfg=ctc_beam_decoding.PyCTCDecodeConfig(), + ), + )) + + text_processing: Optional[TextProcessingConfig] = field(default_factory=lambda: TextProcessingConfig( + punctuation_marks = ".,?", + separate_punctuation = False, + do_lowercase = False, + rm_punctuation = False, + )) + + +class BeamsearchTopNInference(BaseProcessor): + """Adds predictions of a text-based punctuation and capitalization (P&C) model. + + Operates on the text in the ``input_text_field``, and saves predictions in + the ``output_text_field``. + + Args: + input_audio_key (str): the text field that will be the input to the P&C model. + output_text_key (str): the text field where the output of the PC model + will be saved. + batch_size (int): the batch sized used by the P&C model. + device (str): the device used by the P&C model. Can be skipped to auto-select. + pretrained_name (str): the pretrained_name of the P&C model. + model_path (str): the model path to the P&C model. + + .. 
note:: + Either ``pretrained_name`` or ``model_path`` have to be specified. + + Returns: + The same data as in the input manifest with an additional field + containing P&C model's predictions. + """ + + def __init__( + self, + input_audio_key: str, + output_text_key: str, + batch_size: int, + device: Optional[str] = None, + pretrained_name: Optional[str] = None, + model_path: Optional[str] = None, + cfg: Optional[EvalBeamSearchNGramConfig] = EvalBeamSearchNGramConfig(), + **kwargs, + ): + super().__init__(**kwargs) + + self.pretrained_name = pretrained_name + self.model_path = model_path + self.input_audio_key = input_audio_key + self.output_text_key = output_text_key + self.device = device + self.batch_size = batch_size + self.cfg=cfg + + # verify self.pretrained_name/model_path + if self.pretrained_name is None and self.model_path is None: + raise ValueError("pretrained_name and model_path cannot both be None") + if self.pretrained_name is not None and self.model_path is not None: + raise ValueError("pretrained_name and model_path cannot both be specified") + + def process(self): + if self.pretrained_name: + model = EncDecHybridRNNTCTCModel.from_pretrained(self.pretrained_name) + else: + model = EncDecHybridRNNTCTCModel.restore_from(self.model_path) + + if self.device is None: + if torch.cuda.is_available(): + model = model.cuda() + else: + model = model.cpu() + else: + model = model.to(self.device) + + manifest = load_manifest(Path(self.input_manifest_file)) + audio_file_paths = [x[self.input_audio_key] for x in manifest] + + + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc") + else: + model.change_decoding_strategy(None) + + # Override the beam search config with current search candidate configuration + model.cfg.decoding = CTCDecodingConfig( + strategy=self.cfg.ctc_decoding.strategy, + preserve_alignments=self.cfg.ctc_decoding.preserve_alignments, + 
compute_timestamps=self.cfg.ctc_decoding.compute_timestamps, + word_seperator=self.cfg.ctc_decoding.word_seperator, + ctc_timestamp_type=self.cfg.ctc_decoding.ctc_timestamp_type, + batch_dim_index=self.cfg.ctc_decoding.batch_dim_index, + greedy=self.cfg.ctc_decoding.greedy, + confidence_cfg=self.cfg.ctc_decoding.confidence_cfg, + temperature=self.cfg.ctc_decoding.temperature, + beam = ctc_beam_decoding.BeamCTCInferConfig(beam_size=self.cfg.ctc_decoding.beam.beam_size, + beam_alpha=self.cfg.ctc_decoding.beam.beam_alpha, + beam_beta=self.cfg.ctc_decoding.beam.beam_beta, + word_kenlm_path=self.cfg.ctc_decoding.beam.word_kenlm_path, + nemo_kenlm_path=self.cfg.ctc_decoding.beam.nemo_kenlm_path, + preserve_alignments=self.cfg.ctc_decoding.beam.preserve_alignments, + compute_timestamps=self.cfg.ctc_decoding.beam.compute_timestamps, + flashlight_cfg=self.cfg.ctc_decoding.beam.flashlight_cfg, + pyctcdecode_cfg=self.cfg.ctc_decoding.beam.pyctcdecode_cfg, + return_best_hypothesis=self.cfg.ctc_decoding.beam.return_best_hypothesis), + ) + # Update model's decoding strategy + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc') + else: + model.change_decoding_strategy(model.cfg.decoding) + + + with torch.no_grad(): + if isinstance(model, EncDecHybridRNNTCTCModel): + model.cur_decoder = 'ctc' + + override_cfg = model.get_transcribe_config() + override_cfg.batch_size = self.batch_size + override_cfg.return_hypotheses = True + + all_hypotheses = model.transcribe(audio_file_paths, override_config=override_cfg) + if type(all_hypotheses) == tuple and len(all_hypotheses) == 2: # if transcriptions form a tuple of (best_hypotheses, all_hypotheses) + all_hypotheses = all_hypotheses[1] + + pred_texts = [] + for hypotheses in all_hypotheses: + pred_text = [hyp.text for hyp in hypotheses] + pred_texts.append(pred_text) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + with 
Path(self.output_manifest_file).open('w') as f: + for item, t in zip(manifest, pred_texts): + item[self.output_text_key] = t + f.write(json.dumps(item, ensure_ascii=False) + '\n') + +class RestorePCbyTopN(BaseParallelProcessor): + """ + Adds predictions of a audio-based punctuation and capitalization (P&C) model. + + Args: + text_without_pc_key (str): Key to get path to wav file. + texts_with_pc_key (str): Key to put to audio duration. + output_text_key (str): Key to put to audio duration. + Returns: + All the same fields as in the input manifest plus duration_field + """ + + def __init__( + self, + text_without_pc_key: str, + texts_with_pc_key: str, + output_text_key: str, + punctuation: str, + do_lower: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.text_without_pc_key = text_without_pc_key + self.texts_with_pc_key = texts_with_pc_key + self.output_text_key = output_text_key + self.punctuation = punctuation + self.do_lower = do_lower + + def prepare(self): + if self.punctuation: + self.patterns = re.compile("["+self.punctuation+"]") + + def process_dataset_entry(self, data_entry): + text_without_pc = data_entry[self.text_without_pc_key] + texts_with_pc = data_entry[self.texts_with_pc_key] + texts = [] + ldists = [] + for text in texts_with_pc: + if self.do_lower: + text = text.lower() + if self.punctuation: + text = self.patterns.sub('', text) + ldist = Levenshtein.distance(text, text_without_pc) + if ldist == 0: + data_entry[self.output_text_key] = text + return [DataEntry(data=data_entry)] + + ldists.append(ldist) + texts.append(text) + + data_entry[self.output_text_key] = texts[np.argmin(ldists)] + return [DataEntry(data=data_entry)] + \ No newline at end of file From 082d16816052833018b4e9c5d61a188a736fbdad Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 9 May 2024 09:48:37 -0700 Subject: [PATCH 100/115] yaml Signed-off-by: Nikolay Karpov --- dataset_configs/youtube/beamsearch.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 
insertions(+) create mode 100644 dataset_configs/youtube/beamsearch.yaml diff --git a/dataset_configs/youtube/beamsearch.yaml b/dataset_configs/youtube/beamsearch.yaml new file mode 100644 index 00000000..e913c43a --- /dev/null +++ b/dataset_configs/youtube/beamsearch.yaml @@ -0,0 +1,20 @@ +processors_to_run: "0:" +workspace_dir: ??? + +processors: + - _target_: sdp.processors.nemo.beamsearch_inference.BeamsearchTopNInference + input_manifest_file: ${workspace_dir}/mls_test_pc.json + output_manifest_file: ${workspace_dir}/tmp_manifest1.json + input_audio_key: audio_filepath + output_text_key: pred_texts + batch_size: 16 + model_path: /mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/stt_en_fastconformer_hybrid_large_pc.nemo + + + - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN + output_manifest_file: ${workspace_dir}/tmp_manifest2.json + text_without_pc_key: text + texts_with_pc_key: pred_texts + output_text_key: pred_text + punctuation: ",.?" + do_lower: true \ No newline at end of file From c6fe2a5883dea0927929346b3b8acf7b7e8855d8 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 14 May 2024 06:24:40 -0700 Subject: [PATCH 101/115] chunk_manifest Signed-off-by: Nikolay Karpov --- dataset_configs/youtube/beamsearch.yaml | 8 +- sdp/processors/nemo/beamsearch_inference.py | 153 ++++++++++++-------- 2 files changed, 97 insertions(+), 64 deletions(-) diff --git a/dataset_configs/youtube/beamsearch.yaml b/dataset_configs/youtube/beamsearch.yaml index e913c43a..8cbf1e72 100644 --- a/dataset_configs/youtube/beamsearch.yaml +++ b/dataset_configs/youtube/beamsearch.yaml @@ -3,12 +3,14 @@ workspace_dir: ??? 
processors: - _target_: sdp.processors.nemo.beamsearch_inference.BeamsearchTopNInference - input_manifest_file: ${workspace_dir}/mls_test_pc.json + in_memory_chunksize: 10000 + input_manifest_file: /mnt/md1/YTDS/ES/clean_data/es_manifest.json output_manifest_file: ${workspace_dir}/tmp_manifest1.json input_audio_key: audio_filepath output_text_key: pred_texts - batch_size: 16 - model_path: /mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/stt_en_fastconformer_hybrid_large_pc.nemo + batch_size: 64 + device: cuda + model_path: /mnt/md1/YTDS/ES/lm/stt_es_fastconformer_hybrid_large_pc.nemo - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index b183d11a..4e63a7d7 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -77,13 +77,13 @@ class EvalBeamSearchNGramConfig: ctc_decoding: CTCDecodingConfig = field(default_factory=lambda: CTCDecodingConfig( strategy="flashlight", # gready, beam = pyctcdecode, flashlight beam = ctc_beam_decoding.BeamCTCInferConfig( - nemo_kenlm_path="/mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/mls_test_pc_lm.kenlm", + nemo_kenlm_path="/mnt/md1/YTDS/ES/lm/lm.kenlm", beam_size=4, beam_alpha=0.5, # LM weight beam_beta=0.5, # length weight return_best_hypothesis = False, flashlight_cfg=ctc_beam_decoding.FlashlightConfig( - lexicon_path = "/mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/mls_test_pc_lm.flashlight_lexicon"), + lexicon_path = "/mnt/md1/YTDS/ES/lm/lm.flashlight_lexicon"), pyctcdecode_cfg=ctc_beam_decoding.PyCTCDecodeConfig(), ), )) @@ -127,6 +127,7 @@ def __init__( device: Optional[str] = None, pretrained_name: Optional[str] = None, model_path: Optional[str] = None, + in_memory_chunksize: int = 100000, cfg: Optional[EvalBeamSearchNGramConfig] = EvalBeamSearchNGramConfig(), **kwargs, ): @@ -138,6 +139,7 @@ def __init__( self.output_text_key 
= output_text_key self.device = device self.batch_size = batch_size + self.in_memory_chunksize=in_memory_chunksize self.cfg=cfg # verify self.pretrained_name/model_path @@ -146,6 +148,32 @@ def __init__( if self.pretrained_name is not None and self.model_path is not None: raise ValueError("pretrained_name and model_path cannot both be specified") + def _chunk_manifest(self): + """Splits the manifest into smaller chunks defined by ``in_memory_chunksize``. + """ + manifest_chunk = [] + for idx, data_entry in enumerate(self.read_manifest(), 1): + manifest_chunk.append(data_entry) + if idx % self.in_memory_chunksize == 0: + yield manifest_chunk + manifest_chunk = [] + if len(manifest_chunk) > 0: + yield manifest_chunk + + def read_manifest(self): + """Reading the input manifest file. + + .. note:: + This function should be overridden in the "initial" class creating + manifest to read from the original source of data. + """ + if self.input_manifest_file is None: + raise NotImplementedError("Override this method if the processor creates initial manifest") + + with open(self.input_manifest_file, "rt", encoding="utf8") as fin: + for line in fin: + yield json.loads(line) + def process(self): if self.pretrained_name: model = EncDecHybridRNNTCTCModel.from_pretrained(self.pretrained_name) @@ -160,66 +188,69 @@ def process(self): else: model = model.to(self.device) - manifest = load_manifest(Path(self.input_manifest_file)) - audio_file_paths = [x[self.input_audio_key] for x in manifest] - - - if isinstance(model, EncDecHybridRNNTCTCModel): - model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc") - else: - model.change_decoding_strategy(None) - - # Override the beam search config with current search candidate configuration - model.cfg.decoding = CTCDecodingConfig( - strategy=self.cfg.ctc_decoding.strategy, - preserve_alignments=self.cfg.ctc_decoding.preserve_alignments, - compute_timestamps=self.cfg.ctc_decoding.compute_timestamps, - 
word_seperator=self.cfg.ctc_decoding.word_seperator, - ctc_timestamp_type=self.cfg.ctc_decoding.ctc_timestamp_type, - batch_dim_index=self.cfg.ctc_decoding.batch_dim_index, - greedy=self.cfg.ctc_decoding.greedy, - confidence_cfg=self.cfg.ctc_decoding.confidence_cfg, - temperature=self.cfg.ctc_decoding.temperature, - beam = ctc_beam_decoding.BeamCTCInferConfig(beam_size=self.cfg.ctc_decoding.beam.beam_size, - beam_alpha=self.cfg.ctc_decoding.beam.beam_alpha, - beam_beta=self.cfg.ctc_decoding.beam.beam_beta, - word_kenlm_path=self.cfg.ctc_decoding.beam.word_kenlm_path, - nemo_kenlm_path=self.cfg.ctc_decoding.beam.nemo_kenlm_path, - preserve_alignments=self.cfg.ctc_decoding.beam.preserve_alignments, - compute_timestamps=self.cfg.ctc_decoding.beam.compute_timestamps, - flashlight_cfg=self.cfg.ctc_decoding.beam.flashlight_cfg, - pyctcdecode_cfg=self.cfg.ctc_decoding.beam.pyctcdecode_cfg, - return_best_hypothesis=self.cfg.ctc_decoding.beam.return_best_hypothesis), - ) - # Update model's decoding strategy - if isinstance(model, EncDecHybridRNNTCTCModel): - model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc') - else: - model.change_decoding_strategy(model.cfg.decoding) - - - with torch.no_grad(): - if isinstance(model, EncDecHybridRNNTCTCModel): - model.cur_decoder = 'ctc' - - override_cfg = model.get_transcribe_config() - override_cfg.batch_size = self.batch_size - override_cfg.return_hypotheses = True - - all_hypotheses = model.transcribe(audio_file_paths, override_config=override_cfg) - if type(all_hypotheses) == tuple and len(all_hypotheses) == 2: # if transcriptions form a tuple of (best_hypotheses, all_hypotheses) - all_hypotheses = all_hypotheses[1] - - pred_texts = [] - for hypotheses in all_hypotheses: - pred_text = [hyp.text for hyp in hypotheses] - pred_texts.append(pred_text) - Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - with Path(self.output_manifest_file).open('w') as f: - for item, t in zip(manifest, 
pred_texts): - item[self.output_text_key] = t - f.write(json.dumps(item, ensure_ascii=False) + '\n') + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + + for manifest in self._chunk_manifest(): + + audio_file_paths = [x[self.input_audio_key] for x in manifest] + + + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc") + else: + model.change_decoding_strategy(None) + + # Override the beam search config with current search candidate configuration + model.cfg.decoding = CTCDecodingConfig( + strategy=self.cfg.ctc_decoding.strategy, + preserve_alignments=self.cfg.ctc_decoding.preserve_alignments, + compute_timestamps=self.cfg.ctc_decoding.compute_timestamps, + word_seperator=self.cfg.ctc_decoding.word_seperator, + ctc_timestamp_type=self.cfg.ctc_decoding.ctc_timestamp_type, + batch_dim_index=self.cfg.ctc_decoding.batch_dim_index, + greedy=self.cfg.ctc_decoding.greedy, + confidence_cfg=self.cfg.ctc_decoding.confidence_cfg, + temperature=self.cfg.ctc_decoding.temperature, + beam = ctc_beam_decoding.BeamCTCInferConfig(beam_size=self.cfg.ctc_decoding.beam.beam_size, + beam_alpha=self.cfg.ctc_decoding.beam.beam_alpha, + beam_beta=self.cfg.ctc_decoding.beam.beam_beta, + word_kenlm_path=self.cfg.ctc_decoding.beam.word_kenlm_path, + nemo_kenlm_path=self.cfg.ctc_decoding.beam.nemo_kenlm_path, + preserve_alignments=self.cfg.ctc_decoding.beam.preserve_alignments, + compute_timestamps=self.cfg.ctc_decoding.beam.compute_timestamps, + flashlight_cfg=self.cfg.ctc_decoding.beam.flashlight_cfg, + pyctcdecode_cfg=self.cfg.ctc_decoding.beam.pyctcdecode_cfg, + return_best_hypothesis=self.cfg.ctc_decoding.beam.return_best_hypothesis), + ) + # Update model's decoding strategy + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc') + else: + model.change_decoding_strategy(model.cfg.decoding) + + + with torch.no_grad(): + if 
isinstance(model, EncDecHybridRNNTCTCModel): + model.cur_decoder = 'ctc' + + override_cfg = model.get_transcribe_config() + override_cfg.batch_size = self.batch_size + override_cfg.return_hypotheses = True + + all_hypotheses = model.transcribe(audio_file_paths, override_config=override_cfg) + if type(all_hypotheses) == tuple and len(all_hypotheses) == 2: # if transcriptions form a tuple of (best_hypotheses, all_hypotheses) + all_hypotheses = all_hypotheses[1] + + pred_texts = [] + for hypotheses in all_hypotheses: + pred_text = [hyp.text for hyp in hypotheses] + pred_texts.append(pred_text) + + + for item, t in zip(manifest, pred_texts): + item[self.output_text_key] = t + fout.write(json.dumps(item, ensure_ascii=False) + '\n') class RestorePCbyTopN(BaseParallelProcessor): """ From e68d3fe4841f4e975e4dc60a58531c2c3cfcc5b0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 16 May 2024 11:49:48 -0700 Subject: [PATCH 102/115] get_capitalisation_from_target Signed-off-by: Nikolay Karpov --- dataset_configs/youtube/beamsearch.yaml | 6 +++--- sdp/processors/nemo/beamsearch_inference.py | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/dataset_configs/youtube/beamsearch.yaml b/dataset_configs/youtube/beamsearch.yaml index 8cbf1e72..9a181038 100644 --- a/dataset_configs/youtube/beamsearch.yaml +++ b/dataset_configs/youtube/beamsearch.yaml @@ -4,8 +4,8 @@ workspace_dir: ??? 
processors: - _target_: sdp.processors.nemo.beamsearch_inference.BeamsearchTopNInference in_memory_chunksize: 10000 - input_manifest_file: /mnt/md1/YTDS/ES/clean_data/es_manifest.json - output_manifest_file: ${workspace_dir}/tmp_manifest1.json + input_manifest_file: /mnt/md1/YTDS/ES/clean_data/es_manifest_no_punct.json + output_manifest_file: ${workspace_dir}/es_manifest_topn.json input_audio_key: audio_filepath output_text_key: pred_texts batch_size: 64 @@ -14,7 +14,7 @@ processors: - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN - output_manifest_file: ${workspace_dir}/tmp_manifest2.json + output_manifest_file: ${workspace_dir}/es_manifest_restored_punct.json text_without_pc_key: text texts_with_pc_key: pred_texts output_text_key: pred_text diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index 4e63a7d7..99f72e48 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -78,7 +78,7 @@ class EvalBeamSearchNGramConfig: strategy="flashlight", # gready, beam = pyctcdecode, flashlight beam = ctc_beam_decoding.BeamCTCInferConfig( nemo_kenlm_path="/mnt/md1/YTDS/ES/lm/lm.kenlm", - beam_size=4, + beam_size=16, beam_alpha=0.5, # LM weight beam_beta=0.5, # length weight return_best_hypothesis = False, @@ -284,6 +284,15 @@ def prepare(self): if self.punctuation: self.patterns = re.compile("["+self.punctuation+"]") + def get_capitalisation_from_target(self, text_input, text_to_fix): + text_input = text_input.strip() + text_to_fix = text_to_fix.strip() + if text_input[0].isupper(): + text_to_fix = text_to_fix[0].upper()+text_to_fix[1:] + + return text_to_fix + + def process_dataset_entry(self, data_entry): text_without_pc = data_entry[self.text_without_pc_key] texts_with_pc = data_entry[self.texts_with_pc_key] @@ -302,6 +311,7 @@ def process_dataset_entry(self, data_entry): ldists.append(ldist) texts.append(text) - data_entry[self.output_text_key] = 
texts[np.argmin(ldists)] + text_with_pc = self.get_capitalisation_from_target(text_without_pc, texts_with_pc[np.argmin(ldists)]) + data_entry[self.output_text_key] = text_with_pc return [DataEntry(data=data_entry)] \ No newline at end of file From 0b34b9ff9c5e4d0d11a5ae8da0738b17b8314b21 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 17 May 2024 02:08:29 -0700 Subject: [PATCH 103/115] ConcatManifests Signed-off-by: Nikolay Karpov --- dataset_configs/youtube/beamsearch.yaml | 15 ++++- sdp/processors/nemo/beamsearch_inference.py | 73 ++++++++++++++------- 2 files changed, 63 insertions(+), 25 deletions(-) diff --git a/dataset_configs/youtube/beamsearch.yaml b/dataset_configs/youtube/beamsearch.yaml index 9a181038..b9eaadc1 100644 --- a/dataset_configs/youtube/beamsearch.yaml +++ b/dataset_configs/youtube/beamsearch.yaml @@ -12,11 +12,22 @@ processors: device: cuda model_path: /mnt/md1/YTDS/ES/lm/stt_es_fastconformer_hybrid_large_pc.nemo - - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN output_manifest_file: ${workspace_dir}/es_manifest_restored_punct.json text_without_pc_key: text texts_with_pc_key: pred_texts output_text_key: pred_text punctuation: ",.?" 
- do_lower: true \ No newline at end of file + do_lower: true + + - _target_: sdp.processors.KeepOnlySpecifiedFields + fields_to_keep: ["audio_filepath", "duration", "text", "pred_text"] + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/es_manifest_restored_punct_renamed.json + rename_fields: {"pred_text": "text"} + + - _target_: sdp.processors.nemo.beamsearch_inference.ConcatManifests + input_manifest_files: ["${workspace_dir}/es_manifest_restored_punct_renamed.json", "${workspace_dir}/es_manifest_with_punct.json"] + output_manifest_file: ${workspace_dir}/es_manifest_concat.json + \ No newline at end of file diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index 99f72e48..cf65e067 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -40,14 +40,20 @@ from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry -def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: - result = [] - with manifest.open() as f: - for i, line in enumerate(f): - data = json.loads(line) - result.append(data) - return result +def read_manifest(input_manifest_file, encoding): + """Reading the input manifest file. + .. note:: + This function should be overridden in the "initial" class creating + manifest to read from the original source of data. + """ + if input_manifest_file is None: + raise NotImplementedError("Override this method if the processor creates initial manifest") + + with open(input_manifest_file, "rt", encoding=encoding) as fin: + for line in fin: + yield json.loads(line) + @dataclass class EvalBeamSearchNGramConfig: """ @@ -152,7 +158,7 @@ def _chunk_manifest(self): """Splits the manifest into smaller chunks defined by ``in_memory_chunksize``. 
""" manifest_chunk = [] - for idx, data_entry in enumerate(self.read_manifest(), 1): + for idx, data_entry in enumerate(read_manifest(self.input_manifest_file), 1): manifest_chunk.append(data_entry) if idx % self.in_memory_chunksize == 0: yield manifest_chunk @@ -160,20 +166,6 @@ def _chunk_manifest(self): if len(manifest_chunk) > 0: yield manifest_chunk - def read_manifest(self): - """Reading the input manifest file. - - .. note:: - This function should be overridden in the "initial" class creating - manifest to read from the original source of data. - """ - if self.input_manifest_file is None: - raise NotImplementedError("Override this method if the processor creates initial manifest") - - with open(self.input_manifest_file, "rt", encoding="utf8") as fin: - for line in fin: - yield json.loads(line) - def process(self): if self.pretrained_name: model = EncDecHybridRNNTCTCModel.from_pretrained(self.pretrained_name) @@ -314,4 +306,39 @@ def process_dataset_entry(self, data_entry): text_with_pc = self.get_capitalisation_from_target(text_without_pc, texts_with_pc[np.argmin(ldists)]) data_entry[self.output_text_key] = text_with_pc return [DataEntry(data=data_entry)] - \ No newline at end of file + +class ConcatManifests(BaseProcessor): + """Adds predictions of a text-based punctuation and capitalization (P&C) model. + + Operates on the text in the ``input_text_field``, and saves predictions in + the ``output_text_field``. + + Args: + input_audio_key (str): the text field that will be the input to the P&C model. + + .. note:: + Either ``pretrained_name`` or ``model_path`` have to be specified. + + Returns: + The same data as in the input manifest with an additional field + containing P&C model's predictions. 
+ """ + + def __init__( + self, + input_manifest_files: List[str], + encoding: str = "utf8", + ensure_ascii: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_files = input_manifest_files + self.encoding = encoding + self.ensure_ascii = ensure_ascii + + def process(self): + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + with open(self.output_manifest_file, "wt", encoding=self.encoding) as fout: + for input_manifest_file in self.input_manifest_files: + for idx, data_entry in enumerate(read_manifest(input_manifest_file, self.encoding)): + fout.write(json.dumps(data_entry, ensure_ascii=self.ensure_ascii) + '\n') From 4aeb88c0483f9adc8c33b1cc2a52cda385b20fb9 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 21 May 2024 03:49:45 -0700 Subject: [PATCH 104/115] utf8 Signed-off-by: Nikolay Karpov --- sdp/processors/nemo/beamsearch_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index cf65e067..a46c7576 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -158,7 +158,7 @@ def _chunk_manifest(self): """Splits the manifest into smaller chunks defined by ``in_memory_chunksize``. 
""" manifest_chunk = [] - for idx, data_entry in enumerate(read_manifest(self.input_manifest_file), 1): + for idx, data_entry in enumerate(read_manifest(self.input_manifest_file, encoding="utf8"), 1): manifest_chunk.append(data_entry) if idx % self.in_memory_chunksize == 0: yield manifest_chunk From 421bad6c328038c89d8971e5f8506e51406c9c6a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 27 May 2024 05:12:36 -0700 Subject: [PATCH 105/115] shell bool Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 35b0385c..7d9af7ed 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -898,6 +898,7 @@ def __init__( input_manifest_arg: str = "", output_manifest_arg: str = "", arg_separator: str = "=", + shell: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -905,18 +906,17 @@ def __init__( self.output_manifest_arg = output_manifest_arg self.arg_separator = arg_separator self.cmd = cmd + self.shell = shell def process(self): os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: - logger.error( - "input_manifest_file " - + self.input_manifest_file - + " and output_manifest_file " - + self.output_manifest_file - + " should be exluded from cmd line!" 
- ) - raise ValueError + # if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: + # raise ValueError("input_manifest_file " + # + self.input_manifest_file + # + " and output_manifest_file " + # + self.output_manifest_file + # + " should be exluded from cmd line: " + # + self.cmd) process_args = [x for x in self.cmd.split(" ") if x] if self.arg_separator == " ": if self.input_manifest_arg: @@ -928,8 +928,11 @@ def process(self): process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) if self.output_manifest_arg: process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) - - subprocess.run(process_args) + if self.shell: + process_args = " ".join(process_args) + logger.info("subprocess shell: " + process_args) + + subprocess.run(process_args, shell=self.shell) class NmtSubprocess(Subprocess): From a3e56d755b5a4200a128d90e21ee57ab1e3d7d3c Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 29 May 2024 05:46:34 -0700 Subject: [PATCH 106/115] LangIdWhisper Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 2 +- .../huggingface/speech_recognition.py | 77 +++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 58cae45b..aeb9b119 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,7 +32,7 @@ from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( NormalizeFromNonPCTextVoxpopuli, ) -from sdp.processors.huggingface.speech_recognition import ASRTransformers, ASRWhisper +from sdp.processors.huggingface.speech_recognition import ASRTransformers, ASRWhisper, LangIdWhisper from sdp.processors.modify_manifest.common import ( AddConstantFields, ChangeToRelativePath, diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 12a4e5fa..ae8ea0d7 100644 --- 
a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -13,15 +13,92 @@ # limitations under the License. import json +import librosa from pathlib import Path +from collections import Counter from tqdm import tqdm +import soundfile as sf +import numpy as np from sdp.logging import logger from sdp.processors.base_processor import BaseProcessor from sdp.utils.common import load_manifest +class LangIdWhisper(BaseProcessor): + """ + Processor to get Lang ID using ASR Whisper model from HuggingFace. + + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_lang_key (str): field to save language ID result. + device (str): Inference device. + """ + + def __init__( + self, + pretrained_model: str, + output_lang_key: str, + device: str = None, + **kwargs, + ): + super().__init__(**kwargs) + try: + import torch + import whisper + except: + raise ImportError("Need to install whisper: pip install -U openai-whisper") + + logger.warning("This is an example processor, for demonstration only. 
Do not use it for production purposes.") + self.whisper = whisper + self.pretrained_model = pretrained_model + self.device = device + self.output_lang_key = output_lang_key + + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + self.model = whisper.load_model(self.pretrained_model) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_lang = self.segment(item["audio_filepath"], segment_duration=30, num_segments=3, random_seed=None) + item[self.output_lang_key] = pred_lang + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + + def segment(self, path2audio_file, segment_duration, num_segments, random_seed): + audio, sr = sf.read(path2audio_file) + audio = np.float32(audio) + + audio_length = audio.shape[0] + + duration = sr * segment_duration + if duration > audio_length: + duration = audio_length + label_id_list = [] + np.random.seed(random_seed) + starts = np.random.randint(0, audio_length - duration + 1, size=num_segments) + for start in starts: + audio_segm = audio[start : start + duration] + audio_segm = self.whisper.pad_or_trim(audio_segm) + mel = self.whisper.log_mel_spectrogram(audio_segm) + mel = mel.to(self.device) + _, probs = self.model.detect_language(mel) + lang = max(probs, key=probs.get) + label_id_list.append(lang) + + m_label_id = Counter(label_id_list).most_common(1)[0][0] + return m_label_id + class ASRWhisper(BaseProcessor): """ Simple example to transcribe using ASR Whisper model from HuggingFace. 
From 52c85521b07f750c619837a64af107eb0c7c1d42 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 31 May 2024 16:30:30 -0700 Subject: [PATCH 107/115] black Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/commoncrawl/commoncrawl.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 7d9af7ed..e520489e 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -899,6 +899,7 @@ def __init__( output_manifest_arg: str = "", arg_separator: str = "=", shell: bool = False, + dont_wait: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -907,6 +908,7 @@ def __init__( self.arg_separator = arg_separator self.cmd = cmd self.shell = shell + self.dont_wait = dont_wait def process(self): os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) @@ -931,8 +933,12 @@ def process(self): if self.shell: process_args = " ".join(process_args) logger.info("subprocess shell: " + process_args) - - subprocess.run(process_args, shell=self.shell) + + if self.dont_wait: + logger.warning("dont_wait flag is True, no logs captures!") + subprocess.Popen(process_args, shell=self.shell, stdin=None, stdout=None, stderr=None, close_fds=True) + else: + subprocess.run(process_args, shell=self.shell) class NmtSubprocess(Subprocess): From dc64941127f3047a291ac19aa65bf399ebe3dc4c Mon Sep 17 00:00:00 2001 From: Sasha Meister <117230141+ssh-meister@users.noreply.github.com> Date: Sat, 1 Jun 2024 17:02:25 +0200 Subject: [PATCH 108/115] Updated LangIDWhisper processor (#62) Signed-off-by: Sasha Meister Co-authored-by: Sasha Meister --- .../huggingface/speech_recognition.py | 60 ++++++++++++++----- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 
ae8ea0d7..3f9907aa 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -13,7 +13,6 @@ # limitations under the License. import json -import librosa from pathlib import Path from collections import Counter @@ -40,6 +39,9 @@ def __init__( pretrained_model: str, output_lang_key: str, device: str = None, + segment_duration: float = np.inf, + num_segments: int = 1, + random_seed: int = None, **kwargs, ): super().__init__(**kwargs) @@ -54,6 +56,9 @@ def __init__( self.pretrained_model = pretrained_model self.device = device self.output_lang_key = output_lang_key + self.segment_duration = segment_duration + self.num_segments = num_segments + self.random_seed = random_seed if self.device is None: if torch.cuda.is_available(): @@ -69,35 +74,62 @@ def process(self): with Path(self.output_manifest_file).open('w') as f: for item in tqdm(json_list): - pred_lang = self.segment(item["audio_filepath"], segment_duration=30, num_segments=3, random_seed=None) + pred_lang = self.get_label(item["audio_filepath"]) item[self.output_lang_key] = pred_lang f.write(json.dumps(item, ensure_ascii=False) + '\n') - def segment(self, path2audio_file, segment_duration, num_segments, random_seed): - audio, sr = sf.read(path2audio_file) + def get_label(self, path2audio_file): + audio, sample_rate = sf.read(path2audio_file) audio = np.float32(audio) audio_length = audio.shape[0] - duration = sr * segment_duration - if duration > audio_length: - duration = audio_length + audio_segment_samples = sample_rate * self.segment_duration + segments_in_audio = int(audio_length / audio_segment_samples) + segment_starts = [] + segment_ends = [] + + np.random.seed(self.random_seed) + + if segments_in_audio <= 1: + segment_starts = [0] + segment_ends = [audio_length] + else: + if segments_in_audio > self.num_segments: + segments_in_audio = self.num_segments + + long_segment_duration = int(audio_length / segments_in_audio) + + for segment_no in 
range(segments_in_audio): + long_start_segment = long_segment_duration * segment_no + long_end_segment = long_segment_duration * (segment_no + 1) + segment_start = np.random.randint(long_start_segment, long_end_segment - audio_segment_samples) + segment_end = segment_start + audio_segment_samples + segment_starts.append(segment_start) + segment_ends.append(segment_end) + + label_id_list = [] - np.random.seed(random_seed) - starts = np.random.randint(0, audio_length - duration + 1, size=num_segments) - for start in starts: - audio_segm = audio[start : start + duration] - audio_segm = self.whisper.pad_or_trim(audio_segm) - mel = self.whisper.log_mel_spectrogram(audio_segm) + + n_mels = 80 + + if self.pretrained_model = "large-v3": + n_mels=128 + + for segment_start, segment_end in zip(segment_starts, segment_ends): + audio_segement = audio[segment_start:segment_end] + audio_segement = self.whisper.pad_or_trim(audio_segement) + mel = self.whisper.log_mel_spectrogram(audio_segement, n_mels) mel = mel.to(self.device) _, probs = self.model.detect_language(mel) lang = max(probs, key=probs.get) label_id_list.append(lang) - + m_label_id = Counter(label_id_list).most_common(1)[0][0] return m_label_id + class ASRWhisper(BaseProcessor): """ From bb28efce6f9509cc17fe82e364473f592436b909 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 4 Sep 2024 05:52:29 -0700 Subject: [PATCH 109/115] kenlm_path fix Signed-off-by: Nikolay Karpov --- sdp/processors/huggingface/speech_recognition.py | 2 +- sdp/processors/nemo/beamsearch_inference.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index e2fcdb61..68e94680 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -115,7 +115,7 @@ def get_label(self, path2audio_file): n_mels = 80 - if self.pretrained_model = "large-v3": + if 
self.pretrained_model == "large-v3": n_mels=128 for segment_start, segment_end in zip(segment_starts, segment_ends): diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index a46c7576..3eb5c5fa 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -83,7 +83,7 @@ class EvalBeamSearchNGramConfig: ctc_decoding: CTCDecodingConfig = field(default_factory=lambda: CTCDecodingConfig( strategy="flashlight", # gready, beam = pyctcdecode, flashlight beam = ctc_beam_decoding.BeamCTCInferConfig( - nemo_kenlm_path="/mnt/md1/YTDS/ES/lm/lm.kenlm", + kenlm_path="/mnt/md1/YTDS/ES/lm/lm.kenlm", beam_size=16, beam_alpha=0.5, # LM weight beam_beta=0.5, # length weight @@ -207,8 +207,8 @@ def process(self): beam = ctc_beam_decoding.BeamCTCInferConfig(beam_size=self.cfg.ctc_decoding.beam.beam_size, beam_alpha=self.cfg.ctc_decoding.beam.beam_alpha, beam_beta=self.cfg.ctc_decoding.beam.beam_beta, - word_kenlm_path=self.cfg.ctc_decoding.beam.word_kenlm_path, - nemo_kenlm_path=self.cfg.ctc_decoding.beam.nemo_kenlm_path, + kenlm_path=self.cfg.ctc_decoding.beam.kenlm_path, + kenlm_type=self.cfg.ctc_decoding.beam.kenlm_type, preserve_alignments=self.cfg.ctc_decoding.beam.preserve_alignments, compute_timestamps=self.cfg.ctc_decoding.beam.compute_timestamps, flashlight_cfg=self.cfg.ctc_decoding.beam.flashlight_cfg, From c71f558ef00ebfd972c137b86c806d7ad3b0b325 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 27 Sep 2024 15:13:29 -0700 Subject: [PATCH 110/115] add ApplyLlama3 and pnc pipeline Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/pnc.yaml | 85 +++++++++++ sdp/processors/huggingface/llm.py | 217 +++++++++++++++++++++++++++ 2 files changed, 302 insertions(+) create mode 100644 dataset_configs/commoncrawl/pnc.yaml create mode 100644 sdp/processors/huggingface/llm.py diff --git a/dataset_configs/commoncrawl/pnc.yaml b/dataset_configs/commoncrawl/pnc.yaml new file 
mode 100644 index 00000000..72174eb6 --- /dev/null +++ b/dataset_configs/commoncrawl/pnc.yaml @@ -0,0 +1,85 @@ +processors_to_run: "0:" + +WINDOW: 8000 +OFFSET: 0 +THRESHOLD: -5 +MAX_DURATION: 40 +MAX_SILENCE: 1.0 # 1.5 + +MODEL: "stt_en_citrinet_512_gamma_0_25" +NEMO_DIR_PATH: /home/nkarpov/workspace/NeMo_main +TOOLS_DIR: ${NEMO_DIR_PATH}/tools/ctc_segmentation/scripts +DATA_DIR: /mnt/ssd8/multilang/en/val_test/mls/test +workspace_dir: ${DATA_DIR}/manifests + + +processors: + - _target_: sdp.processors.DuplicateFields + input_manifest_file: /mnt/ssd8/multilang/en/val_test/mls/test/mls_test_pc_head.json + duplicate_fields: {"text": "text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + text_key: "text" + + - _target_: sdp.processors.SubRegex + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": "\\s+", "repl": " "} + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/mls_test_example.json + fields_to_keep: ["text", "text_pc", "audio_filepath", "duration"] + + # 4 + - _target_: sdp.processors.huggingface.llm.ApplyLlama3 # pip install num2words huggingface_hub; huggingface-cli; login hf_... 
+ input_manifest_file: /mnt/ssd8/multilang/en/val_test/mls/test/mls_test_nopc.json + input_example_manifest: ${workspace_dir}/mls_test_example.json + example_query_key: "text" + example_response_key: "text_pc" + pretrained_model: "meta-llama/Meta-Llama-3-8B-Instruct" + input_text_key: "text" + main_promt: [ + "Your task is to punctuate the text.", + "You must not change the words in the text.", + "Just add punctuations.", + "You can only use a period, comma or question mark as punctuation.", + "Add capitalization to the beginning of the sentence if necessary.", + "Do not use too long sentences, try to insert period mark.", + "Do not reduce the number of input words", + "Do not add your own comments in the beggining of the answer" + ] + torch_dtype: "float16" + output_text_key: "text_pc" + output_manifest_file: ${workspace_dir}/manifest_pc.json + # 5 + - _target_: sdp.processors.huggingface.llm.WriteTxtFiles + text_key: text_pc + audio_key: audio_filepath + output_dir: ${DATA_DIR}/text + + - _target_: sdp.processors.huggingface.llm.Subprocess + cmd: "python ${TOOLS_DIR}/prepare_data.py \ + --in_text=${DATA_DIR}/text \ + --output_dir=${DATA_DIR}/processed/ \ + --language=en \ + --model=${MODEL} \ + --additional_split_symbols='.' 
class ApplyLlama3(BaseProcessor):
    """
    Processor that prompts an LLM from HuggingFace to punctuate text.

    For every manifest entry, the text under ``input_text_key`` is sent to the
    model (optionally preceded by few-shot examples taken from
    ``input_example_manifest``) and the generated answer is stored under
    ``output_text_key``.

    Args:
        input_example_manifest (str): optional manifest with few-shot examples.
        example_query_key (str): field name that contains the example queries.
        example_response_key (str): field name that contains the example
            ground-truth responses.
        pretrained_model (str): pretrained HuggingFace model name.
        input_text_key (str): field name that contains the input text.
        main_promt (List[str]): sentences of the system prompt; they are joined
            with single spaces into one system message. Defaults to a short
            punctuation-restoration instruction.
        torch_dtype (str): tensor data type, "float16" (default, matching how
            llama3 was trained) or "float32".
        output_text_key (str): field name to store the model answer in.

    .. note::
        Requires ``transformers`` and a CUDA device; the model is loaded
        eagerly in the constructor.
    """

    # Default system prompt kept as an immutable tuple: avoids the
    # mutable-default-argument pitfall of the previous list default.
    _DEFAULT_PROMPT = (
        "Add missing punctuation marks. Don't change the words of the text. Keep the text as it is.",
    )

    def __init__(
        self,
        input_example_manifest: str = None,
        example_query_key: str = "text",
        example_response_key: str = "text_pc",
        pretrained_model: str = "meta-llama/Meta-Llama-3-8B-Instruct",
        input_text_key: str = "text",
        main_promt: Optional[List[str]] = None,
        torch_dtype: str = "float16",
        output_text_key: str = "text_pc",
        **kwargs,
    ):
        super().__init__(**kwargs)
        try:
            import torch
            import transformers
        except ImportError as e:
            # Only translate a genuine import failure; anything else propagates.
            raise ImportError("Need to install transformers: pip install accelerate transformers") from e

        logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.")
        self.pretrained_model = pretrained_model
        self.example_query_key = example_query_key
        self.example_response_key = example_response_key
        self.input_example_manifest = input_example_manifest
        self.input_text_key = input_text_key
        self.output_text_key = output_text_key
        # The prompt sentences are joined into a single system message.
        self.message = " ".join(main_promt if main_promt is not None else self._DEFAULT_PROMPT)
        if torch_dtype == "float32":
            self.torch_dtype = torch.float32
        elif torch_dtype == "float16":
            self.torch_dtype = torch.float16
        else:
            raise NotImplementedError(torch_dtype + " is not implemented!")

        self.pipeline = transformers.pipeline(
            "text-generation",
            model=self.pretrained_model,
            model_kwargs={"torch_dtype": self.torch_dtype},
            device="cuda",
        )

        # The conversation prefix: system message, optionally followed by
        # few-shot (user, assistant) example pairs from the example manifest.
        self.messages = [{"role": "system", "content": self.message}]
        if self.input_example_manifest:
            example_manifest = load_manifest(Path(self.input_example_manifest))
            for data_entry in example_manifest:
                self.messages.append({"role": "user", "content": data_entry[self.example_query_key]})
                self.messages.append({"role": "assistant", "content": data_entry[self.example_response_key]})

    def process(self):
        """Generate a response for every manifest entry and write the results."""
        data_entries = load_manifest(Path(self.input_manifest_file))

        with Path(self.output_manifest_file).open("w") as f:
            for data_entry in data_entries:
                # Copy the shared prefix (system prompt + examples) so that
                # per-entry user turns do not accumulate across iterations.
                messages = self.messages.copy()
                messages.append({"role": "user", "content": data_entry[self.input_text_key]})

                prompt = self.pipeline.tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )

                terminators = [
                    self.pipeline.tokenizer.eos_token_id,
                    self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
                ]

                outputs = self.pipeline(
                    prompt,
                    # Punctuation restoration should not grow the text much;
                    # cap generation at twice the input length (in characters).
                    max_new_tokens=2 * len(data_entry[self.input_text_key]),
                    eos_token_id=terminators,
                    do_sample=True,
                    temperature=0.6,
                    top_p=0.9,
                )

                # Strip the echoed prompt; keep only the generated continuation.
                data_entry[self.output_text_key] = outputs[0]["generated_text"][len(prompt) :]
                f.write(json.dumps(data_entry, ensure_ascii=False) + "\n")
class Subprocess(BaseProcessor):
    """
    Processor that runs an arbitrary command line as a subprocess, optionally
    injecting the input/output manifest paths as command-line arguments.

    Args:
        cmd (str): the command to be executed as a subprocess.
        input_manifest_arg (str, optional): name of the argument that receives
            the input manifest path. Defaults to None (not passed).
        output_manifest_arg (str, optional): name of the argument that receives
            the output manifest path. Defaults to None (not passed).
        arg_separator (str, optional): separator between argument name and
            value ("=" produces "--arg=value", " " produces "--arg value").
            Defaults to "=".
        shell (bool, optional): whether to run the command through the shell
            (subprocess.run / Popen ``shell=`` flag). Defaults to False.
        dont_wait (bool, optional): if True, start the process and return
            immediately without waiting for completion; no logs are captured.
            Defaults to False.
        **kwargs: additional keyword arguments passed to the base class.

    Example:

        _target_: sdp.processors.datasets.commoncrawl.Subprocess
        output_manifest_file: /workspace/manifest.json
        input_manifest_arg: "--manifest"
        output_manifest_arg: "--output_filename"
        arg_separator: "="
        cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \
        --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \
        --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv"
    """

    def __init__(
        self,
        cmd: str,
        input_manifest_arg: str | None = None,
        output_manifest_arg: str | None = None,
        arg_separator: str = "=",
        shell: bool = False,
        dont_wait: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.input_manifest_arg = input_manifest_arg
        self.output_manifest_arg = output_manifest_arg
        self.arg_separator = arg_separator
        self.cmd = cmd
        self.shell = shell
        self.dont_wait = dont_wait

    def process(self):
        """Build the argument list, validate it, and execute the command."""
        os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True)
        # The manifest paths must only be injected via *_manifest_arg; if they
        # are also hard-coded inside cmd they would be passed twice.
        # (Parentheses make the original and/or precedence explicit.)
        if (self.input_manifest_arg is not None and self.cmd.find(self.input_manifest_file) != -1) or (
            self.output_manifest_arg is not None and self.cmd.find(self.output_manifest_file) != -1
        ):
            raise ValueError(
                "input_manifest_file "
                + self.input_manifest_file
                + " and output_manifest_file "
                + self.output_manifest_file
                + " should be excluded from cmd line: "
                + self.cmd
            )
        # NOTE: cmd is split on plain spaces on purpose (configs use "\"
        # continuations, not shell quoting), so shlex is not used here.
        process_args = [x for x in self.cmd.split(" ") if x]
        if self.arg_separator == " ":
            if self.input_manifest_arg:
                process_args.extend([self.input_manifest_arg, self.input_manifest_file])
            if self.output_manifest_arg:
                process_args.extend([self.output_manifest_arg, self.output_manifest_file])
        else:
            if self.input_manifest_arg:
                process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file])
            if self.output_manifest_arg:
                process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file])
        if self.shell:
            # The shell expects a single command string, not an argv list.
            process_args = " ".join(process_args)
            logger.info("subprocess shell: " + process_args)

        if self.dont_wait:
            logger.warning("dont_wait flag is True, no logs captures!")
            subprocess.Popen(process_args, shell=self.shell, stdin=None, stdout=None, stderr=None, close_fds=True)
        else:
            completed = subprocess.run(process_args, shell=self.shell)
            if completed.returncode != 0:
                # Surface failures in the logs; behavior (no exception) is kept.
                logger.warning("Subprocess exited with non-zero code %d: %s", completed.returncode, self.cmd)
class WriteTxtFiles(BaseParallelProcessor):
    """
    Processor that writes the text of every manifest entry into a separate
    .txt file named after the corresponding audio file.

    For an entry with ``audio_key`` "/path/to/sample.wav" the text under
    ``text_key`` is written to ``<output_dir>/sample.txt``. The manifest entry
    itself is passed through unchanged.

    Args:
        text_key (str): field name that contains the text to write.
        audio_key (str): field name that contains the audio file path; its
            base name (without extension) is reused for the .txt file.
        output_dir (str): directory where the .txt files are created.
        **kwargs: additional keyword arguments passed to the base class.
    """

    def __init__(
        self,
        text_key: str,
        audio_key: str,
        output_dir: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.audio_key = audio_key
        self.text_key = text_key
        self.output_dir = output_dir

    def prepare(self):
        # Make sure the target directory exists before parallel workers start.
        os.makedirs(self.output_dir, exist_ok=True)

    def process_dataset_entry(self, data_entry: Dict):
        text = data_entry[self.text_key]
        audiofile_path = data_entry[self.audio_key]
        # "/path/to/sample.wav" -> "sample"
        base_name = os.path.splitext(os.path.basename(audiofile_path))[0]
        output_name = os.path.join(self.output_dir, base_name + ".txt")
        # Write explicit UTF-8 so the output does not depend on the locale.
        with open(output_name, 'w', encoding='utf-8') as file:
            file.write(text)
        return [DataEntry(data=data_entry)]
- -processors: - - _target_: sdp.processors.nemo.beamsearch_inference.BeamsearchTopNInference - in_memory_chunksize: 10000 - input_manifest_file: /mnt/md1/YTDS/ES/clean_data/es_manifest_no_punct.json - output_manifest_file: ${workspace_dir}/es_manifest_topn.json - input_audio_key: audio_filepath - output_text_key: pred_texts - batch_size: 64 - device: cuda - model_path: /mnt/md1/YTDS/ES/lm/stt_es_fastconformer_hybrid_large_pc.nemo - - - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN - output_manifest_file: ${workspace_dir}/es_manifest_restored_punct.json - text_without_pc_key: text - texts_with_pc_key: pred_texts - output_text_key: pred_text - punctuation: ",.?" - do_lower: true - - - _target_: sdp.processors.KeepOnlySpecifiedFields - fields_to_keep: ["audio_filepath", "duration", "text", "pred_text"] - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/es_manifest_restored_punct_renamed.json - rename_fields: {"pred_text": "text"} - - - _target_: sdp.processors.nemo.beamsearch_inference.ConcatManifests - input_manifest_files: ["${workspace_dir}/es_manifest_restored_punct_renamed.json", "${workspace_dir}/es_manifest_with_punct.json"] - output_manifest_file: ${workspace_dir}/es_manifest_concat.json - \ No newline at end of file diff --git a/dataset_configs/youtube/de.yaml b/dataset_configs/youtube/de.yaml deleted file mode 100644 index 333536b1..00000000 --- a/dataset_configs/youtube/de.yaml +++ /dev/null @@ -1,253 +0,0 @@ -processors_to_run: "3:" -base_dir: "/data/supervised/2/audios" -workspace_dir: "/data/processed/2" - -# filters -lang: de -min_duration: 1.0 -max_duration: 40.0 -max_wer: 75.0 -max_cer: 30.0 - - -processors: - # Create initial manifests based on pairs of .opus audio + .srt transcript (with ground-truth timestamps) - - _target_: sdp.processors.datasets.youtube.CreateInitialManifest - data_dir: ${base_dir} - output_audio_dir: ${workspace_dir}/audio/wav_samples - output_manifest_file: 
${workspace_dir}/manifest1.json - chunksize: 10 - in_memory_chunksize: 400 - - # Aggregate ground-truth segments to longer one based on duration threshold - - _target_: sdp.processors.datasets.youtube.AggregateSegments - max_duration: ${max_duration} - output_segments_audio_dir: ${workspace_dir}/audio/wav_segments - output_manifest_file: ${workspace_dir}/manifest2.json - - # Filter out samples which duration is out of range 0-40 sec. - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest3.json - low_duration_threshold: ${min_duration} - high_duration_threshold: ${max_duration} - - # Identify language of the text - - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir}/manifest4.json - input_text_key: orig_text - output_lang_key: text_lang - pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - device: cuda - drop_text_duplicates: True - - - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir}/manifest5.json - input_lang_key: text_lang - output_lang_key: text_lang - - ## Filter out samples with text in non-target language - - _target_: sdp.processors.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest6.json - input_value_key: text_lang - target_value: ${lang} - - # Identify language of the audio - - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir}/manifest7.json - input_audio_key: audio_filepath - output_lang_key: audio_lang - device: cuda - pretrained_model: "langid_ambernet" - - ## Filter out samples with audio in non-target language - - _target_: sdp.processors.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest8.json - input_value_key: audio_lang - target_value: ${lang} - - # ASR Inference - - _target_: sdp.processors.ASRInferenceParallel - output_manifest_file: ${workspace_dir}/manifest9.json - pretrained_model: 
nvidia/stt_${lang}_fastconformer_hybrid_large_pc - batch_size: 64 - devices: 4 - - ## Merge manifests - - _target_: sdp.processors.datasets.youtube.MergeManifests - input_manifest_file: ${workspace_dir}/manifest8.json - input_manifest_file2: ${workspace_dir}/manifest9.json - output_manifest_file: ${workspace_dir}/manifest10.json - key_field: audio_filepath - fields_to_merge: - - {"pred_text" : "pred_text_pc"} - - # Filter out samples with empty pred_text_pc - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: pred_text_pc - regex_patterns: - - "^\\s*$" - - # Preprocess orig text for audio-based TN - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest12.json - duplicate_fields: {"orig_text" : "pre_normalized"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: pre_normalized - regex_params_list: - - {"pattern": '\\[hn]', "repl" : " "} - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "\\[", "repl" : " "} - - {"pattern": "\\]", "repl" : " "} - - {"pattern": "!", "repl" : "."} - - {"pattern": "\\)", "repl" : " "} - - {"pattern": "\\(", "repl" : " "} - - {"pattern": "“", "repl" : " "} - - {"pattern": "„", "repl" : " "} - - {"pattern": "–", "repl" : " "} - - {"pattern": ";", "repl" : ","} - - {"pattern": "'", "repl" : " "} - - {"pattern": "…", "repl" : "."} - - {"pattern": "«", "repl" : " "} - - {"pattern": "»", "repl" : " "} - - {"pattern": "’", "repl" : " "} - - {"pattern": "‘", "repl" : " "} - - {"pattern": "”", "repl" : " "} - - {"pattern": "—", "repl" : " "} - - {"pattern": "´", "repl" : " "} - - {"pattern": "″", "repl" : " "} - - {"pattern": "`", "repl" : " "} - - {"pattern": "\\|", "repl" : " "} - - {"pattern": "−", "repl" : " "} - - {"pattern": "‟", "repl" : " "} - - {"pattern": "‒", "repl" : " "} - - {"pattern": " ", "repl" : " "} - - {"pattern": "", "repl" : " "} - - {"pattern": "‐", "repl" : " "} 
- - {"pattern": "ʻ", "repl" : " "} - - {"pattern": "′", "repl" : " "} - - {"pattern": "\\\\", "repl" : " "} - - {"pattern": "^\\s?\\.\\.\\.", "repl" : ""} - - {"pattern": "\\s?\\.\\.\\.$", "repl" : "."} - - ## Remove extra space - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: pre_normalized - regex_params_list: - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "^\\s+", "repl" : ""} - - {"pattern": "\\s+$", "repl" : ""} - - ## Filter out samples out of Regex - - _target_: sdp.processors.DropIfNoneOfRegexMatch - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pre_normalized - regex_patterns: - - "^[ !#$%&'*+,\\-.0-9:=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_abcdefghijklmnopqrstuvwxyz{}~£¥°²³µÄÖÜßäöüμω₩€/]+$" - - # Run audio based TN - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest16.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=${lang} --n_jobs=-1 --batch_size=600 --manifest_text_field=pre_normalized --manifest_asr_pred_field=pred_text_pc \ - --cache_dir=${workspace_dir}/cache \ - --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist.tsv" - - # Post-normalization processing - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest17.json - duplicate_fields: {"normalized" : "post_normalized"} - - ## Extra chars removing from normalized text - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: post_normalized - regex_params_list: - - {"pattern": "['\\-:{}\\/]", "repl" : " "} - - {"pattern": "!", "repl" : "."} - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "^\\s+", "repl" : ""} - - {"pattern": "\\s+$", "repl" : 
""} - - ## Remove samples with chars out of list (letters, comma, period, question mark, space) - - _target_: sdp.processors.DropIfNoneOfRegexMatch - output_manifest_file: ${workspace_dir}/manifest19.json - text_key: post_normalized - regex_patterns: - - "^[a-zA-ZäÄöÖüÜß,\\.?\\s]+$" - - # Create text field with lowercased clean "post_normalized" - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest20.json - duplicate_fields: {"post_normalized" : "text"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest21.json - text_key: "text" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest22.json - text_key: "text" - regex_params_list: - - {"pattern": "[\\.\\?\\,]", "repl" : " "} - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "^\\s+", "repl" : ""} - - {"pattern": "\\s+$", "repl" : ""} - - # Create pred_text field with lowercased clean "pred_text_pc" - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest23.json - duplicate_fields: {"pred_text_pc" : "pred_text"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest24.json - text_key: "pred_text" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest25.json - text_key: "pred_text" - regex_params_list: - - {"pattern": "[\\.\\?\\,]", "repl" : " "} - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "^\\s+", "repl" : ""} - - {"pattern": "\\s+$", "repl" : ""} - - # Filtration - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest26.json - cer_threshold: ${max_cer} - text_key: "text" - pred_text_key: "pred_text" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest27.json - wer_threshold: ${max_wer} - text_key: "text" - pred_text_key: "pred_text" - - # Finalization - - _target_: 
sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/manifest28.json - fields_to_keep: ["audio_filepath", "duration", "post_normalized"] - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest29.json - rename_fields: {"post_normalized":"text"} - - - _target_: sdp.processors.datasets.commoncrawl.CopyFiles - file_field: audio_filepath - path_to_copy: ${workspace_dir}/clean_data/audio/ - path_levels: 1 - - - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/clean_data/${lang}_manifest.json - path_key: audio_filepath - abs_path_to_drop: ${workspace_dir} - - From f02f37af9d68545e9db2fdaaeef22e6b3be7e96a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 24 Nov 2024 03:42:35 -0800 Subject: [PATCH 112/115] rm sdp/processors/datasets/yt Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/youtube/__init__.py | 18 --- .../datasets/youtube/aggregate_segments.py | 97 ------------- .../youtube/create_initial_manifest.py | 90 ------------- .../datasets/youtube/merge_manifests.py | 35 ----- .../datasets/youtube/requirements.txt | 2 - sdp/processors/datasets/youtube/utils.py | 127 ------------------ 6 files changed, 369 deletions(-) delete mode 100644 sdp/processors/datasets/youtube/__init__.py delete mode 100644 sdp/processors/datasets/youtube/aggregate_segments.py delete mode 100644 sdp/processors/datasets/youtube/create_initial_manifest.py delete mode 100644 sdp/processors/datasets/youtube/merge_manifests.py delete mode 100644 sdp/processors/datasets/youtube/requirements.txt delete mode 100644 sdp/processors/datasets/youtube/utils.py diff --git a/sdp/processors/datasets/youtube/__init__.py b/sdp/processors/datasets/youtube/__init__.py deleted file mode 100644 index 119ac1ca..00000000 --- a/sdp/processors/datasets/youtube/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .create_initial_manifest import CreateInitialManifest -from .utils import parse_srt -from .aggregate_segments import * -from .merge_manifests import MergeManifests \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/aggregate_segments.py b/sdp/processors/datasets/youtube/aggregate_segments.py deleted file mode 100644 index 64927091..00000000 --- a/sdp/processors/datasets/youtube/aggregate_segments.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os - -from pydub import AudioSegment - -from sdp.processors.base_processor import BaseParallelProcessor -from sdp.processors.datasets.youtube.utils import ( - AggregatedSegment, - RawSegment, - get_audio_segment, -) - - -class AggregateSegments(BaseParallelProcessor): - def __init__( - self, - source_audio_key: str = "audio_filepath", - splited_audio_key: str = "audio_filepath", - max_duration: float = 40.0, - crop_audio_segments: bool = True, - output_segments_audio_dir: str = None, - **kwargs, - ): - super().__init__(**kwargs) - self.max_duration = max_duration - self.source_audio_key = source_audio_key - self.splited_audio_key = splited_audio_key - self.crop_audio_segments = crop_audio_segments - self.output_segments_audio_dir = output_segments_audio_dir - - def prepare(self): - if self.crop_audio_segments and self.output_segments_audio_dir: - os.makedirs(os.path.join(self.output_segments_audio_dir), exist_ok=True) - - def process_dataset_entry(self, data_entry: dict): - sample_id = data_entry['sample_id'] - segments = data_entry['segments'] - agg_segments = [] - - if len(segments) == 0: - return agg_segments - - first_segment = RawSegment(**segments[0]) - agg_segment = AggregatedSegment( - segment=first_segment, - segment_id=1, - sample_id=sample_id, - output_audio_dir=self.output_segments_audio_dir, - audio_lang=data_entry['audio_lang'], - text_lang=data_entry['text_lang'], - source_audio=data_entry[self.source_audio_key], - ) - - for segment in segments[1:]: - segment = RawSegment(**segment) - - if ( - not agg_segment.duration_match - or agg_segment.duration >= self.max_duration - or segment.end_time - agg_segment.start_time >= self.max_duration - ): - agg_segments.append(agg_segment.to_dataentry()) - agg_segment = AggregatedSegment( - segment=segment, - segment_id=len(agg_segments) + 1, - sample_id=sample_id, - audio_lang=data_entry['audio_lang'], - text_lang=data_entry['text_lang'], - source_audio=data_entry[self.source_audio_key], - 
output_audio_dir=self.output_segments_audio_dir, - ) - else: - agg_segment.aggregate(segment) - else: - agg_segments.append(agg_segment.to_dataentry()) - - if self.crop_audio_segments: - audio = AudioSegment.from_wav(data_entry[self.source_audio_key]) - for agg_segment in agg_segments: - get_audio_segment( - audio=audio, - start_time=agg_segment.data['start_time'], - end_time=agg_segment.data['end_time'], - output_audio_filepath=agg_segment.data[self.splited_audio_key], - ) - - return agg_segments diff --git a/sdp/processors/datasets/youtube/create_initial_manifest.py b/sdp/processors/datasets/youtube/create_initial_manifest.py deleted file mode 100644 index 3bca6ee1..00000000 --- a/sdp/processors/datasets/youtube/create_initial_manifest.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import Dict -from glob import glob - -from sdp.logging import logger -from sdp.processors.base_processor import BaseParallelProcessor, DataEntry -from sdp.processors.datasets.youtube.utils import parse_srt, Sample -from sdp.utils.common import ffmpeg_convert - -class CreateInitialManifest(BaseParallelProcessor): - def __init__( - self, - data_dir: str, - output_audio_dir: str, - audio_file_extenstion: str = ".opus", - target_samplerate: int = 16000, - target_nchannels: int = 1, - **kwargs, - ): - super().__init__(**kwargs) - self.data_dir = data_dir - self.output_audio_dir = output_audio_dir - self.audio_file_extenstion = audio_file_extenstion - self.target_samplerate = target_samplerate - self.target_nchannels = target_nchannels - - def _get_manifest(self): - audio_filepaths = glob(f"{self.data_dir}/*{self.audio_file_extenstion}") - samples = [] - for audio_filepath in audio_filepaths: - sample = Sample(orig_audio_filepath = audio_filepath) - sample.sample_id = os.path.basename(audio_filepath).replace(self.audio_file_extenstion, "") # Get sample_id - - # Get .srt file, which relaterd to source audio - srt_filepaths = glob(f"{self.data_dir}/{sample.sample_id}.*.srt") - - if len(srt_filepaths) < 1: - logger.warning(f"Sample \"{sample.sample_id}\" has no related .srt files. Skipping") - continue - - srt_filepath = srt_filepaths[0] - if len(srt_filepaths) > 1: - logger.warning(f"Sample \"{sample.sample_id}\" has multiple related .srt files: {', '.join(srt_filepaths)}. 
\ - Only first file will be used for parsing - {srt_filepaths[0]}, other related .srt files will be skipped.") - - sample.srt_filepath = srt_filepath - samples.append(sample.to_dataentry()) - - return samples - - def prepare(self): - os.makedirs(os.path.join(self.output_audio_dir), exist_ok=True) - - def read_manifest(self): - data_entries = self._get_manifest() - return data_entries - - def process_dataset_entry(self, data_entry: DataEntry): - # Convert source_audio_filepath to .wav - data_entry.data['audio_filepath'] = os.path.join(self.output_audio_dir, os.path.basename(data_entry.data['orig_audio_filepath']).replace(self.audio_file_extenstion, ".wav")) - - ffmpeg_convert(input_file=data_entry.data['orig_audio_filepath'], - output_wav=data_entry.data['audio_filepath'], - sample_rate=self.target_samplerate, - num_channels=self.target_nchannels) - - if not os.path.exists(data_entry.data['audio_filepath']): - return [] - - # Parse segments from .srt - segments = parse_srt(data_entry.data['srt_filepath'], verify_duration = True, wav_filepath=data_entry.data['audio_filepath']) - - if len(segments) > 0: - data_entry.data['segments'] = [segment.__dict__ for segment in segments] - - return [data_entry] \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/merge_manifests.py b/sdp/processors/datasets/youtube/merge_manifests.py deleted file mode 100644 index 0860c429..00000000 --- a/sdp/processors/datasets/youtube/merge_manifests.py +++ /dev/null @@ -1,35 +0,0 @@ -from sdp.processors.base_processor import BaseParallelProcessor, DataEntry -import json - -class MergeManifests(BaseParallelProcessor): - def __init__( - self, input_manifest_file2: str, fields_to_merge: dict, key_field: str = "audio_filepath", - **kwargs - ): - super().__init__(**kwargs) - self.input_manifest_file2 = input_manifest_file2 - self.manifest2_dict = {} - self.fields_to_merge = fields_to_merge - self.key_field = key_field - - def prepare(self): - with 
open(self.input_manifest_file2, 'r') as manifest: - line = manifest.readline() - while line: - whole_sample = json.loads(line) - key_value = whole_sample[self.key_field] - sample = {} - for field_names_dict in self.fields_to_merge: - curr_field_name = list(field_names_dict.keys())[0] - sample[curr_field_name] = whole_sample[curr_field_name] - - self.manifest2_dict[key_value] = sample - line = manifest.readline() - - def process_dataset_entry(self, data_entry: dict): - key_value = data_entry[self.key_field] - for field_names_dict in self.fields_to_merge: - curr_field_name = list(field_names_dict.keys())[0] - new_field_name = field_names_dict[curr_field_name] - data_entry[new_field_name] = self.manifest2_dict[key_value][curr_field_name] - return [DataEntry(data=data_entry)] \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/requirements.txt b/sdp/processors/datasets/youtube/requirements.txt deleted file mode 100644 index 6f677747..00000000 --- a/sdp/processors/datasets/youtube/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pysrt -webvtt-py \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/utils.py b/sdp/processors/datasets/youtube/utils.py deleted file mode 100644 index 48483221..00000000 --- a/sdp/processors/datasets/youtube/utils.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import re -from dataclasses import dataclass - -import pysrt -from pydub import AudioSegment - -from sdp.processors.base_processor import DataEntry - - -@dataclass -class RawSegment: - segment_id: int = None - start_time: float = None - end_time: float = None - duration: str = None - duration_match: bool = None - orig_text: str = None - audio_lang: str = None - text_lang: str = None - source_audio: str = None - - def to_dataentry(self): - return DataEntry(data=self.__dict__) - - -class AggregatedSegment(RawSegment): - def __init__( - self, - segment: dict, - segment_id: int, - sample_id: str, - output_audio_dir: str, - audio_lang: str, - text_lang: str, - source_audio: str, - ): - super().__init__(**segment.__dict__) - self.segment_id = f"{sample_id}_{str(segment_id).zfill(4)}" - self.audio_lang = audio_lang - self.text_lang = text_lang - self.source_audio = source_audio - self.audio_filepath = ( - os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None - ) - - def aggregate(self, segment): - self.end_time = segment.end_time - self.duration = self.end_time - self.start_time - self.orig_text = re.sub("\s+", " ", f"{self.orig_text} {segment.orig_text}".strip()) - - -@dataclass -class Sample: - sample_id: str = None - srt_filepath: str = None - orig_audio_filepath: str = None - audio_filepath: str = None - segments: list[RawSegment | AggregatedSegment] = None - - def to_dataentry(self): - data = self.__dict__ - data['segments'] = ( - [segment.data.__dict__ for segment in data['segments']] if data['segments'] is not None else [] - ) - return DataEntry(data=data) - - -def get_audio_segment(audio, start_time: float, end_time: float, output_audio_filepath: str = None): - start_time = start_time * 1000 - end_time = end_time * 1000 - audio_segment = audio[start_time:end_time] - - if output_audio_filepath: - audio_segment.export(output_audio_filepath, format="wav") - return audio_segment - - -def 
get_audio_segment_duration(audio, start_time, end_time): - audio_segment = get_audio_segment(audio, start_time, end_time) - return audio_segment.duration_seconds - - -def parse_srt(srt_filepath, verify_duration: bool = True, wav_filepath: str = None): - subs = pysrt.open(srt_filepath) - srt_segments = [] - - if verify_duration and wav_filepath: - audio = AudioSegment.from_wav(wav_filepath) - else: - audio = None - - epsilon = 1e-2 - - for sub in subs: - segment = RawSegment( - segment_id=sub.index, - start_time=sub.start.ordinal / 1000, - end_time=sub.end.ordinal / 1000, - orig_text=sub.text_without_tags, - ) - - duration_by_timestemps = segment.end_time - segment.start_time - - if audio: - segment.duration = get_audio_segment_duration(audio, segment.start_time, segment.end_time) - segment.duration_match = abs(segment.duration - duration_by_timestemps) < epsilon - else: - segment.duration = duration_by_timestemps - - srt_segments.append(segment) - - return srt_segments From 9d492e9704c40707e3d77863db33fb7d3593380d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 24 Nov 2024 03:43:48 -0800 Subject: [PATCH 113/115] whitespace Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text_mcv/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dataset_configs/armenian/text_mcv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml index 7cbd4bb4..57d9de37 100644 --- a/dataset_configs/armenian/text_mcv/config.yaml +++ b/dataset_configs/armenian/text_mcv/config.yaml @@ -32,6 +32,7 @@ documentation: | * **workspace_dir**: specify the workspace folder where all audio files will be stored. Note that you can customize any part of this config either directly or from command-line. + Here are some common customizations to consider: **Output format**. 
From 82f58e00e54985eb92c966a1c7f15aa4419acdfa Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 24 Nov 2024 03:59:48 -0800 Subject: [PATCH 114/115] rm llm Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/pnc.yaml | 85 ------ dataset_configs/commoncrawl/small.yaml | 82 ------ dataset_configs/commoncrawl/small_de.yaml | 136 ---------- dataset_configs/commoncrawl/small_de_en.yaml | 128 --------- dataset_configs/commoncrawl/small_en.yaml | 246 ------------------ dataset_configs/commoncrawl/small_es.yaml | 160 ------------ dataset_configs/commoncrawl/small_fr.yaml | 120 --------- dataset_configs/commoncrawl/small_pl.yaml | 119 --------- .../commoncrawl/small_sentence.yaml | 119 --------- sdp/processors/huggingface/llm.py | 217 --------------- 10 files changed, 1412 deletions(-) delete mode 100644 dataset_configs/commoncrawl/pnc.yaml delete mode 100644 dataset_configs/commoncrawl/small.yaml delete mode 100644 dataset_configs/commoncrawl/small_de.yaml delete mode 100644 dataset_configs/commoncrawl/small_de_en.yaml delete mode 100644 dataset_configs/commoncrawl/small_en.yaml delete mode 100644 dataset_configs/commoncrawl/small_es.yaml delete mode 100644 dataset_configs/commoncrawl/small_fr.yaml delete mode 100644 dataset_configs/commoncrawl/small_pl.yaml delete mode 100644 dataset_configs/commoncrawl/small_sentence.yaml delete mode 100644 sdp/processors/huggingface/llm.py diff --git a/dataset_configs/commoncrawl/pnc.yaml b/dataset_configs/commoncrawl/pnc.yaml deleted file mode 100644 index 72174eb6..00000000 --- a/dataset_configs/commoncrawl/pnc.yaml +++ /dev/null @@ -1,85 +0,0 @@ -processors_to_run: "0:" - -WINDOW: 8000 -OFFSET: 0 -THRESHOLD: -5 -MAX_DURATION: 40 -MAX_SILENCE: 1.0 # 1.5 - -MODEL: "stt_en_citrinet_512_gamma_0_25" -NEMO_DIR_PATH: /home/nkarpov/workspace/NeMo_main -TOOLS_DIR: ${NEMO_DIR_PATH}/tools/ctc_segmentation/scripts -DATA_DIR: /mnt/ssd8/multilang/en/val_test/mls/test -workspace_dir: ${DATA_DIR}/manifests - - -processors: - - 
_target_: sdp.processors.DuplicateFields - input_manifest_file: /mnt/ssd8/multilang/en/val_test/mls/test/mls_test_pc_head.json - duplicate_fields: {"text": "text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - text_key: "text" - - - _target_: sdp.processors.SubRegex - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": "\\s+", "repl": " "} - - - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/mls_test_example.json - fields_to_keep: ["text", "text_pc", "audio_filepath", "duration"] - - # 4 - - _target_: sdp.processors.huggingface.llm.ApplyLlama3 # pip install num2words huggingface_hub; huggingface-cli; login hf_... - input_manifest_file: /mnt/ssd8/multilang/en/val_test/mls/test/mls_test_nopc.json - input_example_manifest: ${workspace_dir}/mls_test_example.json - example_query_key: "text" - example_response_key: "text_pc" - pretrained_model: "meta-llama/Meta-Llama-3-8B-Instruct" - input_text_key: "text" - main_promt: [ - "Your task is to punctuate the text.", - "You must not change the words in the text.", - "Just add punctuations.", - "You can only use a period, comma or question mark as punctuation.", - "Add capitalization to the beginning of the sentence if necessary.", - "Do not use too long sentences, try to insert period mark.", - "Do not reduce the number of input words", - "Do not add your own comments in the beggining of the answer" - ] - torch_dtype: "float16" - output_text_key: "text_pc" - output_manifest_file: ${workspace_dir}/manifest_pc.json - # 5 - - _target_: sdp.processors.huggingface.llm.WriteTxtFiles - text_key: text_pc - audio_key: audio_filepath - output_dir: ${DATA_DIR}/text - - - _target_: sdp.processors.huggingface.llm.Subprocess - cmd: "python ${TOOLS_DIR}/prepare_data.py \ - --in_text=${DATA_DIR}/text \ - --output_dir=${DATA_DIR}/processed/ \ - --language=en \ - --model=${MODEL} \ - --additional_split_symbols='.' 
\ - --audio_dir=${DATA_DIR}/wav" - - - _target_: sdp.processors.huggingface.llm.Subprocess - cmd: "python ${TOOLS_DIR}/run_ctc_segmentation.py \ - --output_dir=${DATA_DIR}/output \ - --data=${DATA_DIR}/for_ctc_segmentation \ - --model=${MODEL} \ - --window_len=${WINDOW}" - - - _target_: sdp.processors.huggingface.llm.Subprocess - cmd: "python ${TOOLS_DIR}/cut_audio_with_combain_segments.py \ - --output_dir=${DATA_DIR}/output \ - --alignment=${DATA_DIR}/output/segments/ \ - --threshold=${THRESHOLD} \ - --max_duration=${MAX_DURATION} \ - --offset=${OFFSET} \ - --max_silence=${MAX_SILENCE}" \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small.yaml b/dataset_configs/commoncrawl/small.yaml deleted file mode 100644 index be90de1b..00000000 --- a/dataset_configs/commoncrawl/small.yaml +++ /dev/null @@ -1,82 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/ssd8/cc_sdp -final_manifest: ${workspace_dir}/full_manifest.json -group_duration_threshold: 20.0 - -processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_manifest_file: ${workspace_dir}/manifest0.json - resampled_audio_dir: ${workspace_dir}/audio/ - target_samplerate: 16000 - target_nchannels: 1 - audio_field: "audios" - video_field: "videos" - key_field: "key" - text_field: "texts" - - - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - input_manifest_file: ${workspace_dir}/manifest0.json - output_manifest_file: ${workspace_dir}/manifest1.json - vtt_files_dir: ${workspace_dir}/vtts/ - key_field: "key" - text_field: "texts" - vtt_field: "vtt_filepath" - - - _target_: sdp.processors.datasets.commoncrawl.AllVttText - input_manifest_file: ${workspace_dir}/manifest1.json - output_manifest_file: ${workspace_dir}/manifest2.json - input_filepath_field: vtt_filepath - output_text_field: vtt_text - - - _target_: sdp.processors.datasets.commoncrawl.TextLid - 
input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - input_text_field: vtt_text - output_lang_field: text_lang - device: cuda - pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - drop_text_duplicates: True - - - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - input_manifest_file: ${workspace_dir}/manifest3.json - output_manifest_file: ${workspace_dir}/manifest4.json - input_lang_field: text_lang - output_lang_field: text_lang - - - _target_: sdp.processors.datasets.commoncrawl.AudioLid - input_manifest_file: ${workspace_dir}/manifest4.json - output_manifest_file: ${workspace_dir}/manifest5.json - input_audio_field: audios - output_lang_field: audio_lang - device: cuda - pretrained_model: "langid_ambernet" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByVtt - input_manifest_file: ${workspace_dir}/manifest5.json - output_manifest_file: ${workspace_dir}/manifest6.json - splited_audio_dir: ${workspace_dir}/splited/ - source_audio_field: audios - audio_lang_field: audio_lang - text_lang_field: text_lang - key_field: "key" - target_audio_field: "audio_filepath" - duration_field: "durations" - text_field: "text" - vtt_field: "vtt_filepath" - - - _target_: sdp.processors.RenameFields - input_manifest_file: ${workspace_dir}/manifest6.json - rename_fields: {"durations": duration} - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest7.json - high_duration_threshold: 40 - low_duration_threshold: 0.2 - - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_manifest_file: ${workspace_dir}/manifest8.json - output_video_field: video - output_caption_field: caption - key_field: key \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_de.yaml b/dataset_configs/commoncrawl/small_de.yaml deleted file mode 
100644 index cd127fc1..00000000 --- a/dataset_configs/commoncrawl/small_de.yaml +++ /dev/null @@ -1,136 +0,0 @@ -processors_to_run: "3:" -workspace_dir: /mnt/ssd8/cc_sdp/de # ü ä ö ß Ä Ö Ü - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: de - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: de - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: 
${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest6.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest7.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - regex_params_list: - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "'", "repl": " "} - - {"pattern": "[^a-zA-ZäöüÄÖÜß.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest9.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest13.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: 
sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_de_en.yaml b/dataset_configs/commoncrawl/small_de_en.yaml deleted file mode 100644 index f6f6dd7a..00000000 --- a/dataset_configs/commoncrawl/small_de_en.yaml +++ /dev/null @@ -1,128 +0,0 @@ -processors_to_run: "9" -workspace_dir: /mnt/ssd8/cc_sdp/de_en - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: de - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: en - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: 
${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - regex_params_list: - - {"pattern": '+', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: 
${workspace_dir}/manifest9.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" # --overwrite_cache - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest10.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-z'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest12.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest13.json - input_text_field: text - input_audio_field: audio_filepath - output_field: sonar_dist - device: cuda - batch_size: 64 - speech_encoder_model: sonar_speech_encoder_deu - 
text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: eng_Latn - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest14.json - input_field: bleu - target_value: 10 - operator: ge diff --git a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml deleted file mode 100644 index 289bff7b..00000000 --- a/dataset_configs/commoncrawl/small_en.yaml +++ /dev/null @@ -1,246 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/ssd8/cc_sdp/en - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest9a.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: en - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: en - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '¡', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": 
"'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": "%", "repl": ' '} - - {"pattern": '@', "repl": " "} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest6.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest7.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-z'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be 
yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest9.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest13.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - - - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess - input_manifest_file: 
${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest19.json - input_manifest_arg: "manifest_filepath" - output_field: "alignment" - cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=stt_en_fastconformer_hybrid_large_pc \ - output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner - output_manifest_file: ${workspace_dir}/manifest20.json - splited_audio_dir: ${workspace_dir}/nfa - input_field: source_audio - output_field: nfa_filepath - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest21.json - duplicate_fields: {"audio_filepath":"audio_filepath_base"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest22.json - rename_fields: {"nfa_filepath":"audio_filepath"} - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest23.json - high_duration_threshold: 40 - low_duration_threshold: 0.02 - duration_key: nfa_duration - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest24.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest25.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest26.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest27.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest28.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - 
_target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest29.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - - - - _target_: sdp.processors.datasets.commoncrawl.JoinBy - input_manifest_file: ${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest30.json - input_field: source_audio - - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest31.json - input_manifest_arg: "--data_manifest" - output_manifest_arg: "--out_manifest" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NvLLMOps/nvllmops/stages/asr/data_segmentation/ds_align/ds_align.py \ - --splits_dir=/mnt/ssd8/cc_sdp/en/dsa \ - --stt-model-path=/home/nkarpov/ckpts/en/stt_en_conformer_ctc_large_1.1/stt_en_conformer_ctc_large.nemo \ - --stt-model-type=CTC \ - --min-audio-duration=2 \ - --max-audio-duration=40 \ - --asr-batch-size=32" - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest32.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest33.json - text_key: text - pred_text_key: text_asr_pred - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest34.json - text_key: text - pred_text_key: text_asr_pred - cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_es.yaml b/dataset_configs/commoncrawl/small_es.yaml deleted file mode 100644 index 03b11418..00000000 --- a/dataset_configs/commoncrawl/small_es.yaml +++ /dev/null @@ -1,160 +0,0 @@ -processors_to_run: "3:" -workspace_dir: /mnt/ssd8/cc_sdp/es - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: es - - - 
_target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: es - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "í"} - - {"pattern": 'è', "repl": "é"} - - {"pattern": 'È', "repl": "É"} - - {"pattern": 'ù', "repl": "ú"} - - {"pattern": 'ò', "repl": "ó"} - - {"pattern": 'à', "repl": "á"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '•', "repl": " "} - - {"pattern": '●', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # output_manifest_file: 
${workspace_dir}/manifest6.json - # input_manifest_arg: "--input_file" - # output_manifest_arg: "--output_file" - # arg_separator: "=" - # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest6.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest7.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚ'.,?¿]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: 
{text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest9.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": "¿", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest13.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": "¿", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 diff --git a/dataset_configs/commoncrawl/small_fr.yaml b/dataset_configs/commoncrawl/small_fr.yaml deleted file mode 100644 index 
f8699a91..00000000 --- a/dataset_configs/commoncrawl/small_fr.yaml +++ /dev/null @@ -1,120 +0,0 @@ -processors_to_run: "3:" -workspace_dir: /mnt/ssd8/cc_sdp/fr - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - preserve_value: fr - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - preserve_value: fr - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_fr_conformer_transducer_large # nvidia/stt_fr_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": '¡', "repl": " "} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - 
low_wordrate_threshold: 0.01 - - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest7.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest10.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest11.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: text - pred_text_key: pred_text 
- cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_pl.yaml b/dataset_configs/commoncrawl/small_pl.yaml deleted file mode 100644 index ba8d1bd2..00000000 --- a/dataset_configs/commoncrawl/small_pl.yaml +++ /dev/null @@ -1,119 +0,0 @@ -processors_to_run: "3:" -workspace_dir: /mnt/ssd8/cc_sdp/pl - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - preserve_value: pl - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - preserve_value: pl - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_pl_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '¡', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - 
{"pattern": '®', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest7.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest10.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest11.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: text - 
pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml deleted file mode 100644 index 2e311dd3..00000000 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ /dev/null @@ -1,119 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/ssd8/cc_sdp -workspace_dir_diar: /mnt/ssd8/cc_sdp/diarize - -processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - output_manifest_file: ${workspace_dir}/manifest0s.json - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - video_field: "source_video" - text_field: "texts" - key_field: "key" - - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - output_manifest_file: ${workspace_dir}/manifest1s.json - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_video_field: video_url - output_caption_field: caption_url - key_field: key - - - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - # input_manifest_file:${workspace_dir}/manifest_urls.json - output_manifest_file: ${workspace_dir}/manifest2s.json - resampled_audio_dir: ${workspace_dir}/audio - target_samplerate: 16000 - target_nchannels: 1 - input_field: "source_video" - output_field: "source_audio" - key_field: "key" - - - _target_: sdp.processors.datasets.commoncrawl.AudioDuration - output_manifest_file: ${workspace_dir}/manifest3s.json - input_field: source_audio - output_field: duration - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest4s.json - input_field: duration - target_value: 0 - operator: gt - - - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - 
output_manifest_file: ${workspace_dir}/manifest5s.json - vtt_files_dir: ${workspace_dir}/vtts/ - key_field: "key" - text_field: "texts" - vtt_field: "vtt_filepath" - - - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir}/manifest6s.json - input_filepath_field: vtt_filepath - output_text_field: vtt_text - - - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir}/manifest7s.json - input_text_field: vtt_text - output_lang_field: text_lang - device: cuda - pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - drop_text_duplicates: True - - - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir}/manifest4.json - input_lang_field: text_lang - output_lang_field: text_lang - - - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir}/manifest5.json - input_audio_field: audios - output_lang_field: audio_lang - device: cuda - pretrained_model: "langid_ambernet" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir}/manifest6a.json - splited_audio_dir: ${workspace_dir}/splited_s/ - source_audio_field: audios - vtt_field: "vtt_filepath" - target_audio_field: "audio_filepath" - duration_field: "duration" - text_field: "text" - proxy_fields: [audio_lang, text_lang, audios] - duration_threshold: 10.0 - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest7a.json - high_duration_threshold: 40 - low_duration_threshold: 0.02 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8a.json - duplicate_fields: {"audios": "source_audio"} - - - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/manifest9a.json - fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] - - - 
_target_: sdp.processors.datasets.commoncrawl.EvalBandwidth - output_manifest_file: ${workspace_dir}/manifest10a.json - input_field: audio_filepath - output_field: bandwidth - - - _target_: sdp.processors.RenameFields - input_manifest_file: ${workspace_dir}/manifest5.json - output_manifest_file: ${workspace_dir_diar}/manifest0.json - rename_fields: {"source_audio":"audio_filepath"} - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - input_manifest_arg: "diarizer.manifest_filepath" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - --config-path=/home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/conf/inference/ --config-name=diar_infer_general.yaml \ - diarizer.out_dir=${workspace_dir_diar} \ - diarizer.speaker_embeddings.parameters.save_embeddings=False \ - diarizer.vad.model_path=/home/nkarpov/ckpts/diar/vad_multilingual_marblenet.nemo \ - diarizer.speaker_embeddings.model_path=/home/nkarpov/ckpts/diar/titanet-l.nemo \ - diarizer.clustering.parameters.max_num_speakers=4 \ - diarizer.clustering.parameters.enhanced_count_thres=80 \ - diarizer.vad.parameters.onset=0.1 \ - diarizer.vad.parameters.offset=0.1 " \ No newline at end of file diff --git a/sdp/processors/huggingface/llm.py b/sdp/processors/huggingface/llm.py deleted file mode 100644 index b71286f3..00000000 --- a/sdp/processors/huggingface/llm.py +++ /dev/null @@ -1,217 +0,0 @@ -import json -import os -import subprocess -from pathlib import Path -from typing import Dict, List, Optional, Union - -from sdp.logging import logger -from sdp.processors.base_processor import ( - BaseParallelProcessor, - BaseProcessor, - DataEntry, -) -from sdp.utils.common import load_manifest - - -class ApplyLlama3(BaseProcessor): - """ - Processor to prompt llm model from HuggingFace. - - Args: - input_example_manifest (str): Assistent example manifest file. 
- example_query_key (str): Field name that contains examples queries. - example_response_key (str): Field name that contains examples ground truth responses. - pretrained_model (str): Pretrained model name. - input_text_key (str): Field name that contains input text. - message (str): LLM command text. - torch_dtype (str): Tensor data type. Default to "float16" (as llama3 is trained so). - output_text_key (str): Key to save result. - """ - - def __init__( - self, - input_example_manifest: str = None, - example_query_key: str = "text", - example_response_key: str = "text_pc", - pretrained_model: str = "meta-llama/Meta-Llama-3-8B-Instruct", - input_text_key: str = "text", - main_promt: List[str] = [ - "Add missing punctuation marks. Don't change the words of the text. Keep the text as it is." - ], - torch_dtype: str = "float16", - output_text_key: str = "text_pc", - **kwargs, - ): - super().__init__(**kwargs) - try: - import torch - import transformers - except: - raise ImportError("Need to install transformers: pip install accelerate transformers") - - logger.warning("This is an example processor, for demonstration only. 
Do not use it for production purposes.") - self.pretrained_model = pretrained_model - self.example_query_key = example_query_key - self.example_response_key = example_response_key - self.input_example_manifest = input_example_manifest - self.input_text_key = input_text_key - self.output_text_key = output_text_key - self.message = " ".join(main_promt) - if torch_dtype == "float32": - self.torch_dtype = torch.float32 - elif torch_dtype == "float16": - self.torch_dtype = torch.float16 - else: - raise NotImplementedError(torch_dtype + " is not implemented!") - - self.pipeline = transformers.pipeline( - "text-generation", - model=self.pretrained_model, - model_kwargs={"torch_dtype": self.torch_dtype}, - device="cuda", - ) - - self.messages = [{"role": "system", "content": self.message}] - if self.input_example_manifest: - example_manifest = load_manifest(Path(self.input_example_manifest)) - for data_entry in example_manifest: - self.messages.append({"role": "user", "content": data_entry[self.example_query_key]}) - self.messages.append({"role": "assistant", "content": data_entry[self.example_response_key]}) - - def process(self): - data_entries = load_manifest(Path(self.input_manifest_file)) - - with Path(self.output_manifest_file).open("w") as f: - for data_entry in data_entries: - messages = self.messages.copy() - messages.append({"role": "user", "content": data_entry[self.input_text_key]}) - - prompt = self.pipeline.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - terminators = [ - self.pipeline.tokenizer.eos_token_id, - self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"), - ] - - outputs = self.pipeline( - prompt, - max_new_tokens=2 * len(data_entry[self.input_text_key]), - eos_token_id=terminators, - do_sample=True, - temperature=0.6, - top_p=0.9, - ) - - data_entry[self.output_text_key] = outputs[0]["generated_text"][len(prompt) :] - f.write(json.dumps(data_entry, ensure_ascii=False) + "\n") - - -class 
Subprocess(BaseProcessor): - """ - Processor for handling subprocess execution with additional features for managing input and output manifests. - - Args: - cmd (str): The command to be executed as a subprocess. - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - shell (bool, optional): The argument specifies whether to use shell for subprocess.run(). Defaults to False. - dont_wait (bool, optional): The argument specifies whether to wait while the subprocess finishes. . Defaults to False. - **kwargs: Additional keyword arguments to be passed to the base class. - - Example: - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: /workspace/manifest.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - """ - - def __init__( - self, - cmd: str, - input_manifest_arg: str | None = None, - output_manifest_arg: str | None = None, - arg_separator: str = "=", - shell: bool = False, - dont_wait: bool = False, - **kwargs, - ): - super().__init__(**kwargs) - self.input_manifest_arg = input_manifest_arg - self.output_manifest_arg = output_manifest_arg - self.arg_separator = arg_separator - self.cmd = cmd - self.shell = shell - self.dont_wait = dont_wait - - def process(self): - os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - if ( - 
self.input_manifest_arg is not None - and self.cmd.find(self.input_manifest_file) != -1 - or self.output_manifest_arg is not None - and self.cmd.find(self.output_manifest_file) != -1 - ): - raise ValueError( - "input_manifest_file " - + self.input_manifest_file - + " and output_manifest_file " - + self.output_manifest_file - + " should be exluded from cmd line: " - + self.cmd - ) - process_args = [x for x in self.cmd.split(" ") if x] - if self.arg_separator == " ": - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg, self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg, self.output_manifest_file]) - else: - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) - if self.shell: - process_args = " ".join(process_args) - logger.info("subprocess shell: " + process_args) - - if self.dont_wait: - logger.warning("dont_wait flag is True, no logs captures!") - subprocess.Popen(process_args, shell=self.shell, stdin=None, stdout=None, stderr=None, close_fds=True) - else: - subprocess.run(process_args, shell=self.shell) - - -class WriteTxtFiles(BaseParallelProcessor): - """ """ - - def __init__( - self, - text_key: Dict, - audio_key: Dict, - output_dir: str, - **kwargs, - ): - super().__init__(**kwargs) - self.audio_key = audio_key - self.text_key = text_key - self.output_dir = output_dir - - def prepare(self): - os.makedirs(self.output_dir, exist_ok=True) - - def process_dataset_entry(self, data_entry: Dict): - text = data_entry[self.text_key] - audiofile_path = data_entry[self.audio_key] - base_name = os.path.splitext(os.path.split(audiofile_path)[1])[0] - output_name = os.path.join(self.output_dir, base_name + ".txt") - with open(output_name, 'w') as file: - file.write(text) - return 
[DataEntry(data=data_entry)] From e9bb5db90433dc858de9a60444139dbfa7db3891 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 24 Nov 2024 06:07:24 -0800 Subject: [PATCH 115/115] rm extra langs Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 101 ---------- dataset_configs/commoncrawl/big_de_en.yaml | 142 -------------- dataset_configs/commoncrawl/big_en_de.yaml | 131 ------------- dataset_configs/commoncrawl/big_en_fr.yaml | 122 ------------ dataset_configs/commoncrawl/big_es.yaml | 218 --------------------- dataset_configs/commoncrawl/big_eu.yaml | 113 ----------- dataset_configs/commoncrawl/big_fr_en.yaml | 138 ------------- dataset_configs/commoncrawl/big_it.yaml | 150 -------------- dataset_configs/commoncrawl/big_nl.yaml | 128 ------------ 9 files changed, 1243 deletions(-) delete mode 100644 dataset_configs/commoncrawl/big.yaml delete mode 100644 dataset_configs/commoncrawl/big_de_en.yaml delete mode 100644 dataset_configs/commoncrawl/big_en_de.yaml delete mode 100644 dataset_configs/commoncrawl/big_en_fr.yaml delete mode 100644 dataset_configs/commoncrawl/big_es.yaml delete mode 100644 dataset_configs/commoncrawl/big_eu.yaml delete mode 100644 dataset_configs/commoncrawl/big_fr_en.yaml delete mode 100644 dataset_configs/commoncrawl/big_it.yaml delete mode 100644 dataset_configs/commoncrawl/big_nl.yaml diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml deleted file mode 100644 index 44199a43..00000000 --- a/dataset_configs/commoncrawl/big.yaml +++ /dev/null @@ -1,101 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md1/out # /mnt/md1/common_crawl/cc_sdp -workspace_dir_s: /mnt/md0/out - -processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/md1/out/output_valid_captions - output_manifest_file: ${workspace_dir}/manifest0.json - video_field: "videos" - key_field: "key" - text_field: "texts" - - - _target_: 
sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/md1/out/output_valid_captions - output_manifest_file: ${workspace_dir}/manifest1.json - output_video_field: video_url - output_caption_field: caption_url - key_field: key - - - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - output_manifest_file: ${workspace_dir}/manifest2.json #${workspace_dir_s}/manifest_urls.json - resampled_audio_dir: ${workspace_dir_s}/audio - target_samplerate: 16000 - target_nchannels: 1 - input_field: "videos" - output_field: "audios" - key_field: "key" - - - _target_: sdp.processors.datasets.commoncrawl.AudioDuration - output_manifest_file: ${workspace_dir}/manifest3.json - input_field: audios - output_field: duration - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest4.json - input_field: duration - target_value: 0 - operator: gt - - - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - output_manifest_file: ${workspace_dir}/manifest5.json - vtt_files_dir: ${workspace_dir}/vtts/ - key_field: "key" - text_field: "texts" - vtt_field: "vtt_filepath" - - - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir}/manifest6.json - input_filepath_field: vtt_filepath - output_text_field: vtt_text - - - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir}/manifest7.json - input_text_field: vtt_text - output_lang_field: text_lang - device: cuda - pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - drop_text_duplicates: True - - - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir}/manifest8.json - input_lang_field: text_lang - output_lang_field: text_lang - - - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir}/manifest9.json - input_audio_field: audios - output_lang_field: audio_lang - device: cuda - 
pretrained_model: "langid_ambernet" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir}/manifest10.json - splited_audio_dir: ${workspace_dir}/splited - source_audio_field: audios - target_audio_field: audio_filepath - duration_field: duration - text_field: text - vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang, audios] - duration_threshold: 10.0 - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest11.json - high_duration_threshold: 60 - low_duration_threshold: 0.01 - - - _target_: sdp.processors.RenameFields - input_manifest_file: ${workspace_dir}/manifest9.json - output_manifest_file: ${workspace_dir}/manifest12.json - rename_fields: {"audios":"audio_filepath"} - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest13.json - input_manifest_arg: "diarizer.manifest_filepath" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - diarizer.out_dir=${workspace_dir}/diar \ - diarizer.speaker_embeddings.parameters.save_embeddings=False \ - diarizer.vad.model_path=/home/nkarpov/ckpts/diar/vad_multilingual_marblenet.nemo \ - diarizer.speaker_embeddings.model_path=/home/nkarpov/ckpts/diar/titanet-l.nemo" \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_de_en.yaml b/dataset_configs/commoncrawl/big_de_en.yaml deleted file mode 100644 index eb429f45..00000000 --- a/dataset_configs/commoncrawl/big_de_en.yaml +++ /dev/null @@ -1,142 +0,0 @@ -processors_to_run: "14:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/de_en - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: de - - - _target_: 
sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: en - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": 
"'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest9.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest10.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-zäöüÄÖÜß'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be 
yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest13.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest14.json - input_text_field: text - input_audio_field: audio_filepath - output_field: sonar_dist - device: cuda - speech_encoder_model: sonar_speech_encoder_deu - text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: eng_Latn - batch_size: 64 - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest15s.json - input_field: sonar_dist - target_value: 0.1 - operator: le - - # - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - # output_manifest_file: ${workspace_dir}/manifest15.json - # input_field: bleu - # target_value: 10 - # operator: ge \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en_de.yaml b/dataset_configs/commoncrawl/big_en_de.yaml deleted file mode 100644 index a39dc84c..00000000 --- a/dataset_configs/commoncrawl/big_en_de.yaml +++ /dev/null @@ -1,131 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_de - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: en - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: de - - - _target_: 
sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_en_de_transformer12x2.nemo --target_lang=de --source_lang=en" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '¡', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": 
"-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest9.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest10.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_params_list: - - {"pattern": 'ç', "repl": "c"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-zäöüÄÖÜß.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest13.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest14.json - input_text_field: text - input_audio_field: audio_filepath - 
output_field: sonar_dist - device: cuda - speech_encoder_model: sonar_speech_encoder_eng - text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: deu_Latn - batch_size: 64 - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest15.json - input_field: bleu - target_value: 30 - operator: ge diff --git a/dataset_configs/commoncrawl/big_en_fr.yaml b/dataset_configs/commoncrawl/big_en_fr.yaml deleted file mode 100644 index 441d665b..00000000 --- a/dataset_configs/commoncrawl/big_en_fr.yaml +++ /dev/null @@ -1,122 +0,0 @@ -processors_to_run: "12:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_fr - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: en - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: fr - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # 
--tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_en_fr_transformer12x2.nemo --target_lang=fr --source_lang=en" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": " "} - # - {"pattern": "%", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # # input_manifest_file: ${workspace_dir}/manifest7.json - # output_manifest_file: ${workspace_dir}/manifest10.json - # input_manifest_arg: "--manifest" - # output_manifest_arg: "--output_filename" - # arg_separator: "=" - # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - # --language=fr --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - # 
--whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/fr/data/whitelist.tsv" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest10.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest11.json - input_text_field: text - input_audio_field: audio_filepath - output_field: sonar_dist - device: cuda - speech_encoder_model: sonar_speech_encoder_eng - text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: fra_Latn - batch_size: 64 - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest12.json - input_field: bleu - target_value: 30 - operator: ge diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml deleted file mode 100644 index dda3e771..00000000 --- a/dataset_configs/commoncrawl/big_es.yaml +++ /dev/null @@ -1,218 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md1/out/es #/mnt/md0/common_crawl/cc_sdp/es - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest9a.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: es - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - 
input_field: text_lang - target_value: es - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - text_key: text - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": '\((.*?)\)', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'è', "repl": "e"} - - {"pattern": 'È', "repl": "E"} - - {"pattern": 'ù', "repl": "u"} - - {"pattern": 'ò', "repl": "o"} - - {"pattern": 'à', "repl": "a"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '•', "repl": " "} - - {"pattern": '●', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: text - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_patterns: - - 
"^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest7.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" - # --overwrite_cache - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest8.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} - - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} - - {"pattern": '\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáÁéÉíÍñÑúÚüÜ'.,?¿]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: 
${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest14.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - - - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess - input_manifest_file: ${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest19.json - input_manifest_arg: "manifest_filepath" - output_field: "alignment" - cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=nvidia/stt_es_fastconformer_hybrid_large_pc \ - output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner - output_manifest_file: ${workspace_dir}/manifest20.json - splited_audio_dir: ${workspace_dir}/nfa - input_field: source_audio - output_field: 
nfa_filepath - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest21.json - duplicate_fields: {"audio_filepath":"audio_filepath_base"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest22.json - rename_fields: {"nfa_filepath":"audio_filepath"} - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest23.json - high_duration_threshold: 60 - low_duration_threshold: 0.01 - duration_key: nfa_duration - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest24.json - pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest25.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest26.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest27.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest28.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest29.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_eu.yaml b/dataset_configs/commoncrawl/big_eu.yaml deleted file mode 100644 index fc7e8e49..00000000 --- a/dataset_configs/commoncrawl/big_eu.yaml +++ /dev/null @@ -1,113 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/eu - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: 
/mnt/md0/common_crawl/cc_sdp/manifest8.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: eu - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: eu - - - _target_: sdp.processors.datasets.commoncrawl.ASR_HF - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: cahya/wav2vec2-large-xlsr-basque - output_text_field: pred_text - batch_size: 16 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - text_key: text - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.SubRegex - 
output_manifest_file: ${workspace_dir}/manifest7.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} - - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} - - {"pattern": '\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáÁéÉíÍñÑúÚüÜçÇ'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr_en.yaml b/dataset_configs/commoncrawl/big_fr_en.yaml deleted file mode 100644 index d00548a8..00000000 --- a/dataset_configs/commoncrawl/big_fr_en.yaml +++ /dev/null @@ -1,138 +0,0 @@ -processors_to_run: "14:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr_en - -processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: 
/mnt/md0/common_crawl/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: fr - - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: en - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_fr_fastconformer_hybrid_large_pc #stt_fr_conformer_transducer_large - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_fr_en_transformer12x2.nemo --target_lang=en --source_lang=fr" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifes7.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": 
'—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.cc.cc.Subprocess - output_manifest_file: ${workspace_dir}/manifest9.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - # --overwrite_cache - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest10.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-zàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: 
{text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.cc.cc.BLEUScore - output_manifest_file: ${workspace_dir}/manifest13.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.cc.cc.UseSonar - output_manifest_file: ${workspace_dir}/manifest14.json - input_text_field: text - input_audio_field: audio_filepath - output_field: sonar_dist - device: cuda - speech_encoder_model: sonar_speech_encoder_fra - text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: eng_Latn - batch_size: 64 - - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest15.json - input_field: bleu - target_value: 10 - operator: ge diff --git a/dataset_configs/commoncrawl/big_it.yaml b/dataset_configs/commoncrawl/big_it.yaml deleted file mode 100644 index d95e835f..00000000 --- a/dataset_configs/commoncrawl/big_it.yaml +++ /dev/null @@ -1,150 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/it - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: it - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: it - - - _target_: 
sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_it_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - text_key: text - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '•', "repl": " "} - - {"pattern": '●', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: text - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest7.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python 
/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/it/data/whitelist.tsv" - # --overwrite_cache - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest8.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} - - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} - - {"pattern": '\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZàèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - 
{"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest14.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_nl.yaml b/dataset_configs/commoncrawl/big_nl.yaml deleted file mode 100644 index 254b1694..00000000 --- a/dataset_configs/commoncrawl/big_nl.yaml +++ /dev/null @@ -1,128 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/nl - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: nl - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: nl - - - _target_: sdp.processors.datasets.commoncrawl.ASR_HF - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: jonatasgrosman/wav2vec2-large-xlsr-53-dutch - output_text_field: pred_text - 
batch_size: 16 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - text_key: text - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'è', "repl": "e"} - - {"pattern": 'È', "repl": "E"} - - {"pattern": 'ù', "repl": "u"} - - {"pattern": 'ò', "repl": "o"} - - {"pattern": 'à', "repl": "a"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '•', "repl": " "} - - {"pattern": '●', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: text - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - 
{"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} - - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} - - {"pattern": '\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáÁéÉíÍúÚöÖäÄëËïÏüÜ'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - \ No newline at end of file