Skip to content

Instantly share code, notes, and snippets.

@albertz
Last active February 22, 2022 13:23
Show Gist options
  • Save albertz/8429f7ad06bf32832b7c6ff227598982 to your computer and use it in GitHub Desktop.
Save albertz/8429f7ad06bf32832b7c6ff227598982 to your computer and use it in GitHub Desktop.
class OldToNewTranscriptions(Job):
    """
    Rewrite a transcription dict file so that its keys use the new seq tag format.

    The input file must contain a single Python dict literal ``{seq_tag: text}``.
    Every key is converted like this::

        old: 'dev-other-116-288045-0000'
        new: 'LibriSpeech/dev-other/116/288045/116-288045-0000.flac.ogg'

    The result is written as a dict literal again (one entry per line),
    to the job output ``output.txt.gz``.
    """

    def __init__(self, input_txt: Path):
        """
        :param input_txt: file containing a Python dict literal which maps
            old-style seq tags to their transcription strings
        """
        super(OldToNewTranscriptions, self).__init__()
        self.input_txt = input_txt
        self.output_txt = self.output_path("output.txt.gz")

    def run(self):
        """
        Read the input dict, convert every seq tag and write the new dict.

        :raises ValueError: if a seq tag does not have the expected old format
        """
        import re
        from recipe.utils import generic_open

        # NOTE(review): eval() executes whatever is in the input file.
        # This is only acceptable as long as the file is trusted;
        # consider ast.literal_eval if the content is always a plain literal.
        # The `with` makes sure the input file is closed again.
        with generic_open(self.input_txt.get_path()) as inp:
            input_transcriptions = eval(inp.read())

        # <corpus-part>-<speaker>-<chapter>-<utterance>, e.g. 'dev-other-116-288045-0000'.
        # Compiled once, outside of the loop.
        old_seq_tag_pattern = re.compile(r"([a-z\-]+)-([0-9]+)-([0-9]+)-([0-9]+)")

        print("Create file:", self.output_txt.get_path())
        count = 0
        with generic_open(self.output_txt.get_path(), "w") as out:
            out.write("{\n")
            # Entries are ordered by their old seq tag.
            for old_seq_tag, txt in sorted(input_transcriptions.items()):
                assert isinstance(txt, str)
                m = old_seq_tag_pattern.match(old_seq_tag)
                if m is None:
                    # Without this check, we would fail with an unhelpful AttributeError on None.
                    raise ValueError(f"unexpected seq tag format: {old_seq_tag!r}")
                part, speaker, chapter, utt = m.groups()
                new_seq_tag = f"LibriSpeech/{part}/{speaker}/{chapter}/{speaker}-{chapter}-{utt}.flac.ogg"
                out.write("%r: %r,\n" % (new_seq_tag, txt))
                count += 1
            out.write("}\n")
        print("Num seqs:", count)

    def tasks(self):
        """
        :return: yields the single task of this job, which calls :func:`run` as a mini task
        """
        yield Task('run', rqmt={'mem': 1, 'time': 10}, mini_task=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment