Skip to content

Commit

Permalink
remove redundant manifest filepaths from spanish configs and update docs about how linking works
Browse files: browse the repository at this point in the history

Signed-off-by: Elena Rastorgueva <[email protected]>
  • Loading branch information
erastorgueva-nv committed Nov 9, 2023
1 parent 39a3be3 commit d3dd8ae
Show file tree
Hide file tree
Showing 6 changed files with 11 additions and 103 deletions.
2 changes: 0 additions & 2 deletions dataset_configs/spanish/mls/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,10 @@ processors:
data_split: "${data_split}"

- _target_: sdp.processors.ASRInference
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest.json"
output_manifest_file: "${workspace_dir}/processed_manifests/stt_es_quartznet15x5_${data_split}.json"
pretrained_model: "stt_es_quartznet15x5"

- _target_: sdp.processors.SubRegex
input_manifest_file: "${workspace_dir}/processed_manifests/stt_es_quartznet15x5_${data_split}.json"
regex_params_list:
- {"pattern": "'", "repl": ""} # so that e.g. "d'artagnan" becomes "dartagnan", not "d artagnan"

Expand Down
21 changes: 0 additions & 21 deletions dataset_configs/spanish_pc/fisher/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,14 @@ processors:
path_to_sph2pipe: ${path_to_sph2pipe}

- _target_: sdp.processors.SortManifest
input_manifest_file: "${processed_manifest_dir}/all_initial_manifest.json"
output_manifest_file: "${processed_manifest_dir}/all_sorted_manifest.json"
attribute_sort_by: "duration"
descending: false

- _target_: sdp.processors.ASRInference
input_manifest_file: "${processed_manifest_dir}/all_sorted_manifest.json"
output_manifest_file: "${processed_manifest_dir}/all_transcribed.json"
pretrained_model: "stt_es_citrinet_512"

- _target_: sdp.processors.SubRegex
input_manifest_file: "${processed_manifest_dir}/all_transcribed.json"
output_manifest_file: "${processed_manifest_dir}/all_sub_hesitations.json"
regex_params_list:
- {"pattern": "\\s(\\S+-\\s)+", "repl": " "}

Expand All @@ -48,8 +43,6 @@ processors:
- {input: {text: "abc y yes"}, output: {text: "abc y yes"} }

- _target_: sdp.processors.SubRegex
input_manifest_file: "${processed_manifest_dir}/all_sub_hesitations.json"
output_manifest_file: "${processed_manifest_dir}/all_sub_to_spaces.json"

regex_params_list:
- {"pattern": "!", "repl": "."}
Expand All @@ -73,8 +66,6 @@ processors:
- {input: {text: "abc- def."}, output: {text: "abc def."}}

- _target_: sdp.processors.DropIfRegexMatch
input_manifest_file: "${processed_manifest_dir}/all_sub_to_spaces.json"
output_manifest_file: "${processed_manifest_dir}/all_drop_brackets.json"
regex_patterns: [
"\\(\\)", "\\(\\(", "\\)\\)", "<foreign", "</foreign", "< foreign" ,
"<lname",
Expand All @@ -84,8 +75,6 @@ processors:
- {input: {text: "abcdef"}, output: {text: "abcdef"}}

- _target_: sdp.processors.SubRegex
input_manifest_file: "${processed_manifest_dir}/all_drop_brackets.json"
output_manifest_file: "${processed_manifest_dir}/all_sub_typos.json"
regex_params_list:
- {"pattern": "à", "repl" : "pa"} # typo, infrequent
- {"pattern": "è", "repl" : "pe"} # typo, infrequent
Expand All @@ -95,26 +84,19 @@ processors:
- {input: {text: "@ abc"}, output: {text: "arroba abc"}}

- _target_: sdp.processors.DropIfRegexMatch
input_manifest_file: "${processed_manifest_dir}/all_sub_typos.json"
output_manifest_file: "${processed_manifest_dir}/all_drop_empty.json"
regex_patterns: ["^\\s*$"]
test_cases:
- {input: {text: ""}, output: null}
- {input: {text: " "}, output: null}
- {input: {text: "abcdef"}, output: {text: "abcdef"}}

- _target_: sdp.processors.DropNonAlphabet
input_manifest_file: "${processed_manifest_dir}/all_drop_empty.json"
output_manifest_file: "${processed_manifest_dir}/all_only_alphabet.json"

alphabet: " abcdefghijklmnopqrstuvwxyzáéíñóúüABCDEFGHIJKLMNOPQRSTUVWXYZÁÉÍÑÓÚÜ,.?¿"
test_cases:
- {input: {text: "test тест 测试"}, output: null}
- {input: {text: "test"}, output: {text: "test"}}

- _target_: sdp.processors.DropHighLowDuration
input_manifest_file: "${processed_manifest_dir}/all_only_alphabet.json"
output_manifest_file: "${processed_manifest_dir}/all_drop_duration.json"
high_duration_threshold: 20
low_duration_threshold: 1 # there are about 6 hours of utterances less than 1 second (short phrases like "mhm", "claro", "y tu"...)
test_cases:
Expand All @@ -123,8 +105,6 @@ processors:
- {input: {duration: 5}, output: {duration: 5}}

- _target_: sdp.processors.SubRegex
input_manifest_file: "${processed_manifest_dir}/all_drop_duration.json"
output_manifest_file: "${processed_manifest_dir}/all_punc_spaces.json"
regex_params_list:
- {"pattern": '\.', "repl": ' . '}
- {"pattern": ",", "repl": " , "}
Expand All @@ -135,7 +115,6 @@ processors:
- {input: {text: "¿abc? def."}, output: {text: "¿ abc ? def ."}}

- _target_: sdp.processors.AddConstantFields
input_manifest_file: "${workspace_dir}/manifests/all_punc_spaces.json"
output_manifest_file: "${workspace_dir}/manifests/all_cleaned.json"
fields: {"text_pc_origin": "original"}

Expand Down
39 changes: 0 additions & 39 deletions dataset_configs/spanish_pc/mls/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ processors:
data_split: "${data_split}"

- _target_: sdp.processors.RestorePCForMLS
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_initial.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc.json"
language_long: ${language_long}
language_short: ${language_short}
Expand All @@ -42,7 +41,6 @@ processors:
show_conversion_breakdown: true

- _target_: sdp.processors.PCInference
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_just_added_pred.json"
model_path: "${pc_model_path}"
input_text_field: "text"
Expand All @@ -51,8 +49,6 @@ processors:

# fix up text_pc_pred outputs by putting upside down question mark at the start of the word (not at the end, as it currently is)
- _target_: sdp.processors.SubRegex
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_just_added_pred.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred_a.json"
text_key: "text_pc_pred"
regex_params_list:
- {"pattern": ' (\w+)¿ ', "repl": ' ¿\1 '}
Expand All @@ -61,15 +57,11 @@ processors:
- {input: {text_pc_pred: "cómo¿ estás? def."}, output: {text_pc_pred: "¿cómo estás? def."}}

- _target_: sdp.processors.MakeLettersUppercaseAfterPeriod
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred_a.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred_b.json"
text_key: "text_pc_pred"
test_cases:
- {input: {text_pc_pred: "¿cómo estás? def."}, output: {text_pc_pred: "¿cómo estás? Def."}}

- _target_: sdp.processors.RenameFields
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred_b.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred_c.json"
rename_fields:
"text": "original_dataset_text"

Expand All @@ -80,17 +72,12 @@ processors:
- field: text_pc_pred
origin_label: synthetic
target: text
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred_c.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred.json"

- _target_: sdp.processors.ASRInference
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred_inference.json"
pretrained_model: "stt_es_quartznet15x5"

- _target_: sdp.processors.SubRegex
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_with_pred_inference.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_sub_regex_a.json"
regex_params_list:
- {"pattern": "!", "repl": "."}
- {"pattern": "…", "repl": "."}
Expand Down Expand Up @@ -185,8 +172,6 @@ processors:

# bunch of strings we map to spaces
- _target_: sdp.processors.SubRegex
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_sub_regex_a.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_sub_regex.json"
regex_params_list:
- {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "}
- {"pattern": "'", "repl": " "}
Expand Down Expand Up @@ -223,8 +208,6 @@ processors:
- {input: {text: '"abc"'}, output: {text: "abc"}}

- _target_: sdp.processors.DropNonAlphabet
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_pc_sub_regex.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_only_alphabet.json"
alphabet: " abcdefghijklmnopqrstuvwxyzáéíñóúüABCDEFGHIJKLMNOPQRSTUVWXYZÁÉÍÑÓÚÜ,.?¿"
test_cases:
- {input: {text: "test тест 测试"}, output: null}
Expand All @@ -233,8 +216,6 @@ processors:

# run manual normalization of roman numerals
- _target_: dataset_configs.spanish.mls.unique_processors.clean_roman_numerals.CleanRomanNumerals
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_only_alphabet.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_normalized.json"
king_triggers: [
"alfonso",
"benedicto",
Expand Down Expand Up @@ -276,8 +257,6 @@ processors:

# drop spaced out regex
- _target_: sdp.processors.DropIfRegexMatch
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_normalized.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_spaced_out.json"
regex_patterns:
- '(\D ){5,20}'

Expand All @@ -289,8 +268,6 @@ processors:

# drop bad books
- _target_: sdp.processors.DropIfRegexMatch
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_spaced_out.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_bad_books.json"
text_key: "audio_filepath"
regex_patterns: [
# books with lots of OCR errors etc.
Expand Down Expand Up @@ -329,8 +306,6 @@ processors:


- _target_: sdp.processors.DropIfRegexMatch
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_bad_books.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_librivox.json"
text_key: "pred_text"
regex_patterns: [
'librewox', 'librevox', 'librivox', 'libribox', 'libriebox', 'libriboux',
Expand All @@ -342,19 +317,13 @@ processors:


- _target_: sdp.processors.DuplicateFields
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_librivox.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_make_text_no_pc_a.json"
duplicate_fields:
"text": "text_no_pc"

- _target_: sdp.processors.SubMakeLowercase
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_make_text_no_pc_a.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_make_text_no_pc_b.json"
text_key: "text_no_pc"

- _target_: sdp.processors.SubRegex
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_make_text_no_pc_b.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_make_text_no_pc.json"
text_key: "text_no_pc"
regex_params_list:
- {"pattern": '[¿?.,]', "repl": ""}
Expand All @@ -377,8 +346,6 @@ processors:


- _target_: sdp.processors.DropASRErrorBeginningEnd
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_insertion.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_asr_error.json"
text_key: "text_no_pc"
beginning_error_char_threshold: 10
end_error_char_threshold: 10
Expand All @@ -390,17 +357,13 @@ processors:

- _target_: sdp.processors.DropHighWER
text_key: "text_no_pc"
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_asr_error.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_wer.json"
wer_threshold: 90
test_cases:
- {input: {text_no_pc: "sí hola", pred_text: "abcdefgh abcdefgh"}, output: null}
- {input: {text_no_pc: "sí hola", pred_text: "sí hola"}, output: {text_no_pc: "sí hola", pred_text: "sí hola"}}

- _target_: sdp.processors.DropHighCER
text_key: "text_no_pc"
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_wer.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_cer.json"
cer_threshold: 90
test_cases:
- {input: {text_no_pc: "sí hola", pred_text: "abcdefgh abcdefgh"}, output: null}
Expand All @@ -419,7 +382,6 @@ processors:
- {input: {text: "buenos dias", duration: 1}, output: {text: "buenos dias", duration: 1}}

- _target_: sdp.processors.SubRegex
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_manifest_drop_charrate.json"
output_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_cleaned.json"
regex_params_list:
- {"pattern": '\.', "repl": ' . '}
Expand All @@ -431,7 +393,6 @@ processors:
- {input: {text: "¿abc? def."}, output: {text: "¿ abc ? def ."}}

- _target_: sdp.processors.KeepOnlySpecifiedFields
input_manifest_file: "${workspace_dir}/mls_spanish_processed/${data_split}_cleaned.json"
output_manifest_file: "${final_manifest}"
fields_to_keep:
- "audio_filepath"
Expand Down
Loading

0 comments on commit d3dd8ae

Please sign in to comment.