Module vtt_to_srt

Convert vtt subtitle files to srt format

Expand source code
#!/usr/bin/python
# Jansen A. Simanullang / Jeison Cardoso

"""Convert of vtt to srt format"""

import os
import re
import argparse
from string import Template
from stat import S_ISDIR, ST_MODE, S_ISREG


class VttToStr:
    """Convert vtt subtitle text and files to srt

    The text-level methods take the whole file contents as a single string and
    return a new string; ``process`` combines them with file reading/writing.
    """

    def __init__(self) -> None:
        pass

    def convert_header(self, contents):
        """Remove the vtt header lines from the contents

        Deletes every ``WEBVTT``, ``Kind: ...`` and ``Language: ...`` line.

        :contents -- contents of vtt file
        :returns -- contents without those lines
        """
        replacement = re.sub(r"WEBVTT\n", "", contents)
        replacement = re.sub(r"Kind:[ \-\w]+\n", "", replacement)
        replacement = re.sub(r"Language:[ \-\w]+\n", "", replacement)
        return replacement

    def add_padding_to_timestamp(self, contents):
        """Pad short comma-style timestamp lines to the full HH:MM:SS,mmm form

        ``MM:SS,mmm`` ranges get a ``00:`` prefix and ``SS,mmm`` ranges get a
        ``00:00:`` prefix. Trailing ``name:value`` settings on the timestamp
        line are matched by the pattern and therefore dropped.

        :contents -- contents whose timestamps already use a comma
        :returns -- contents with padded timestamp lines
        """
        find_srt = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
        minute = r"((?:\d\d:){1}\d\d)"
        second = r"((?:\d\d:){0}\d\d)"
        padding_minute = find_srt.substitute(a=minute, b=r"(\d{0,3})")
        padding_second = find_srt.substitute(a=second, b=r"(\d{0,3})")
        # Minutes first: afterwards those lines are full-length and can no
        # longer be matched by the seconds-only pattern.
        replacement = re.sub(
            padding_minute, r"00:\1,\2 --> 00:\3,\4\n", contents)
        return re.sub(padding_second, r"00:00:\1,\2 --> 00:00:\3,\4\n", replacement)

    def convert_timestamp(self, contents):
        """Convert timestamp lines of a vtt file to srt format

        Rewrites the separator before the milliseconds to a comma, drops any
        trailing ``name:value`` settings, then pads short timestamps.

        :contents -- contents of vtt file
        :returns -- contents with srt-style timestamp lines
        """
        # NOTE: the separator is an unescaped "." in the pattern, so any single
        # character is accepted between the clock and the milliseconds.
        find_vtt = Template(r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
        all_timestamp = find_vtt.substitute(
            a=r"((?:\d\d:){0,2}\d\d)", b=r"(\d{0,3})")
        return self.add_padding_to_timestamp(re.sub(all_timestamp, r"\1,\2 --> \3,\4\n", contents))

    def convert_content(self, contents):
        """Convert content of vtt file to srt format

        Converts timestamps, removes the header, strips ``<c...>``/``</c>``
        tags, inline ``<HH:MM:SS.mmm>`` tags, ``::name(arg) {...}`` blocks and
        the ``Style:``/``##`` marker, then numbers the cues.

        :contents -- contents of vtt file
        :returns -- srt text
        """
        replacement = self.convert_timestamp(contents)
        replacement = self.convert_header(replacement)
        replacement = re.sub(r"<c[.\w\d]*>", "", replacement)
        replacement = re.sub(r"</c>", "", replacement)
        replacement = re.sub(r"<\d\d:\d\d:\d\d.\d\d\d>", "", replacement)
        replacement = re.sub(
            r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n", "", replacement)
        replacement = re.sub(r"Style:\n##\n", "", replacement)
        replacement = self.add_sequence_numbers(replacement)

        return replacement

    def has_timestamp(self, content):
        """Check if a line starts with a full srt timestamp range

        :content -- a single line of text
        :returns -- True when the line begins with ``HH:MM:SS,mmm --> HH:MM:SS,mmm``
        """
        return re.match(r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})", content) is not None

    def add_sequence_numbers(self, contents):
        """Adds sequence numbers to subtitle contents and returns new subtitle contents

        A counter line (starting at 1) is inserted before every timestamp line.
        Every input line is written back followed by a newline.

        :contents -- subtitle contents with srt timestamps
        :returns -- numbered subtitle contents
        """
        numbered = []
        i = 1
        for line in contents.split('\n'):
            if self.has_timestamp(line):
                numbered.append(str(i))
                i += 1
            numbered.append(line)
        # Same text the line-by-line concatenation produced, built in one pass.
        return '\n'.join(numbered) + '\n'

    def write_file(self, filename: str, data, encoding_format: str = "utf-8"):
        """Create a file with some data

        If the file cannot be opened at ``filename``, the write is retried in
        the current working directory using only the last path component.

        :filename -- filename path
        :data -- data to write (converted with ``str``)
        :encoding_format -- encoding format
        """
        try:
            with open(filename, "w", encoding=encoding_format) as file:
                file.write(str(data))
        except IOError:
            # Best effort: fall back to the bare file name in the current directory.
            filename = filename.split(os.sep)[-1]
            with open(filename, "w", encoding=encoding_format) as file:
                file.write(str(data))
        print(f"file created {filename}\n")

    def read_file(self, filename: str, encoding_format: str = "utf-8"):
        """Read a text file

        :filename -- filename path
        :encoding_format -- encoding format
        :returns -- the whole file as a string
        """
        content: str = ''
        with open(filename, mode="r", encoding=encoding_format) as file:
            print(f"file being read: {filename}\n")
            content = file.read()

        return content

    def process(self, filename: str, encoding_format: str = "utf-8"):
        """Convert vtt file to a srt file

        The output is written next to the input: a trailing ``.vtt`` extension
        (any letter case) is replaced by ``.srt``; for any other name ``.srt``
        is appended so the input file is never overwritten.

        :filename -- filename path
        :encoding_format -- encoding format
        """
        file_contents: str = self.read_file(filename, encoding_format)
        str_data: str = self.convert_content(file_contents)
        # Only touch the final extension; ".vtt" elsewhere in the path
        # (for example in a directory name) must stay as it is.
        root, extension = os.path.splitext(filename)
        if extension.lower() == ".vtt":
            filename = root + ".srt"
        else:
            filename = filename + ".srt"
        self.write_file(filename, str_data, encoding_format)


class ConvertFile:
    """Convert vtt file to srt file"""

    def __init__(self, pathname: str, encoding_format: str):
        """Constructor

        :pathname -- path to the file to convert
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def convert(self):
        """Convert the file when its name ends with ``.vtt``; otherwise do nothing"""
        # Test the suffix, not a substring: ".vtt" inside a directory name or
        # in the middle of a file name must not trigger a conversion.
        # The comparison stays case-sensitive on purpose: the converter derives
        # the output name from a lowercase ".vtt".
        if self.pathname.endswith(".vtt"):
            self.vtt_to_str.process(self.pathname, self.encoding_format)


class ConvertDirectories:
    """Convert vtt files to srt files"""

    def __init__(self, pathname: str, enable_recursive: bool, encoding_format: str):
        """Constructor

        :pathname -- path to the directory to search
        :enable_recursive -- enable recursive
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.enable_recursive = enable_recursive
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def _walk_dir(self, top_most_path: str, callback):
        """Walk a single directory level, without descending

        :top_most_path -- parent directory
        :callback -- function called with the path of every non-directory entry
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            if not os.path.isdir(pathname):
                # It's a file, call the callback function
                callback(pathname)

    def _walk_tree(self, top_most_path, callback):
        """Recursively descend the directory tree rooted at top_most_path,
        calling the callback function for each regular file

        Entries that cannot be inspected are reported and skipped so that one
        bad entry does not abort the whole walk.

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            try:
                mode = os.stat(pathname)[ST_MODE]
            except OSError:
                # os.stat follows symlinks, so e.g. a dangling link ends up here.
                print(f"Skipping {pathname}")
                continue
            if S_ISDIR(mode):
                # It's a directory, recurse into it
                self._walk_tree(pathname, callback)
            elif S_ISREG(mode):
                # It's a file, call the callback function
                callback(pathname)
            else:
                # Unknown file type, print a message
                print(f"Skipping {pathname}")

    def convert_vtt_to_str(self, file: str):
        """Convert one file to srt when its name ends with ``.vtt``

        A ``UnicodeDecodeError`` raised during the conversion is reported and
        swallowed so the remaining files are still handled.

        :file -- file to convert
        """
        # Suffix test rather than substring test: ".vtt" inside a directory
        # name or in the middle of a file name must not trigger a conversion.
        if file.endswith(".vtt"):
            try:
                self.vtt_to_str.process(file, self.encoding_format)
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError: {file}")

    def _vtt_to_srt_batch(self, directory: str):
        """Walk down directory searching for vtt files

        :directory -- path to search
        """
        top_most_path = directory
        if self.enable_recursive:
            self._walk_tree(top_most_path, self.convert_vtt_to_str)
        else:
            self._walk_dir(top_most_path, self.convert_vtt_to_str)

    def convert(self):
        """Convert vtt files to srt files"""
        self._vtt_to_srt_batch(self.pathname)


def _show_usage():
    """Print a short usage summary for the command line tool"""
    usage_lines = (
        "\nUsage:\tvtt_to_srt pathname [-r]\n",
        "\tpathname\t- a file or directory with files to be converted",
        "\t-r\t\t- walk path recursively\n",
    )
    for text in usage_lines:
        print(text)


def _parse_args():
    """Build the argument parser and return the parsed command line

    The result carries ``pathname``, ``recursive`` (flag) and ``encoding``.
    """
    cli = argparse.ArgumentParser(description='Convert vtt files to srt files')
    cli.add_argument(
        "pathname",
        help="a file or directory with files to be converted",
    )
    cli.add_argument(
        "-r", "--recursive",
        action="store_true",
        help="walk path recursively",
    )
    cli.add_argument(
        "-e", "--encoding",
        help="encoding format for input and output files",
    )
    return cli.parse_args()


def main():
    """Command line entry point

    Parses the arguments and converts the given file or directory; prints the
    usage text when the pathname is neither.
    """
    options = _parse_args()
    pathname = options.pathname
    # Fall back to utf-8 when no encoding was given on the command line
    encoding = options.encoding or "utf-8"

    if os.path.isfile(pathname):
        print(f"file being converted: {pathname}\n")
        ConvertFile(pathname, encoding).convert()
    elif os.path.isdir(pathname):
        print(f"directory being converted: {pathname}\n")
        ConvertDirectories(pathname, options.recursive, encoding).convert()
    else:
        print(f"pathname is not a file or directory: {pathname}\n")
        _show_usage()


# Run the command line interface only when executed as a script, not on import
if __name__ == "__main__":
    main()

Functions

def main()

main function

Expand source code
def main():
    """Command line entry point

    Parses the arguments and converts the given file or directory; prints the
    usage text when the pathname is neither.
    """
    options = _parse_args()
    pathname = options.pathname
    # Fall back to utf-8 when no encoding was given on the command line
    encoding = options.encoding or "utf-8"

    if os.path.isfile(pathname):
        print(f"file being converted: {pathname}\n")
        ConvertFile(pathname, encoding).convert()
    elif os.path.isdir(pathname):
        print(f"directory being converted: {pathname}\n")
        ConvertDirectories(pathname, options.recursive, encoding).convert()
    else:
        print(f"pathname is not a file or directory: {pathname}\n")
        _show_usage()

Classes

class ConvertDirectories (pathname: str, enable_recursive: bool, encoding_format: str)

Convert vtt files to srt files

Constructor

:pathname – path to file or directory :enable_recursive – enable recursive :encoding_format – encoding format

Expand source code
class ConvertDirectories:
    """Convert vtt files to srt files"""

    def __init__(self, pathname: str, enable_recursive: bool, encoding_format: str):
        """Constructor

        :pathname -- path to the directory to search
        :enable_recursive -- enable recursive
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.enable_recursive = enable_recursive
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def _walk_dir(self, top_most_path: str, callback):
        """Walk a single directory level, without descending

        :top_most_path -- parent directory
        :callback -- function called with the path of every non-directory entry
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            if not os.path.isdir(pathname):
                # It's a file, call the callback function
                callback(pathname)

    def _walk_tree(self, top_most_path, callback):
        """Recursively descend the directory tree rooted at top_most_path,
        calling the callback function for each regular file

        Entries that cannot be inspected are reported and skipped so that one
        bad entry does not abort the whole walk.

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            try:
                mode = os.stat(pathname)[ST_MODE]
            except OSError:
                # os.stat follows symlinks, so e.g. a dangling link ends up here.
                print(f"Skipping {pathname}")
                continue
            if S_ISDIR(mode):
                # It's a directory, recurse into it
                self._walk_tree(pathname, callback)
            elif S_ISREG(mode):
                # It's a file, call the callback function
                callback(pathname)
            else:
                # Unknown file type, print a message
                print(f"Skipping {pathname}")

    def convert_vtt_to_str(self, file: str):
        """Convert one file to srt when its name ends with ``.vtt``

        A ``UnicodeDecodeError`` raised during the conversion is reported and
        swallowed so the remaining files are still handled.

        :file -- file to convert
        """
        # Suffix test rather than substring test: ".vtt" inside a directory
        # name or in the middle of a file name must not trigger a conversion.
        if file.endswith(".vtt"):
            try:
                self.vtt_to_str.process(file, self.encoding_format)
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError: {file}")

    def _vtt_to_srt_batch(self, directory: str):
        """Walk down directory searching for vtt files

        :directory -- path to search
        """
        top_most_path = directory
        if self.enable_recursive:
            self._walk_tree(top_most_path, self.convert_vtt_to_str)
        else:
            self._walk_dir(top_most_path, self.convert_vtt_to_str)

    def convert(self):
        """Convert vtt files to srt files"""
        self._vtt_to_srt_batch(self.pathname)

Methods

def convert(self)

Convert vtt files to srt files

Expand source code
def convert(self):
    """Run the vtt to srt batch conversion on the configured pathname"""
    start_directory = self.pathname
    self._vtt_to_srt_batch(start_directory)
def convert_vtt_to_str(self, file: str)

Convert a vtt file to an srt file

:file – file to convert

Expand source code
def convert_vtt_to_str(self, file: str):
    """Convert one file to srt when its name ends with ``.vtt``

    A ``UnicodeDecodeError`` raised during the conversion is reported and
    swallowed so the caller can continue with other files.

    :file -- file to convert
    """
    # Suffix test rather than substring test: ".vtt" inside a directory
    # name or in the middle of a file name must not trigger a conversion.
    if file.endswith(".vtt"):
        try:
            self.vtt_to_str.process(file, self.encoding_format)
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError: {file}")
class ConvertFile (pathname: str, encoding_format: str)

Convert vtt file to srt file

Constructor

:pathname – path to file or directory :encoding_format – encoding format

Expand source code
class ConvertFile:
    """Convert vtt file to srt file"""

    def __init__(self, pathname: str, encoding_format: str):
        """Constructor

        :pathname -- path to the file to convert
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def convert(self):
        """Convert the file when its name ends with ``.vtt``; otherwise do nothing"""
        # Test the suffix, not a substring: ".vtt" inside a directory name or
        # in the middle of a file name must not trigger a conversion.
        # The comparison stays case-sensitive on purpose: the converter derives
        # the output name from a lowercase ".vtt".
        if self.pathname.endswith(".vtt"):
            self.vtt_to_str.process(self.pathname, self.encoding_format)

Methods

def convert(self)

Convert vtt file to srt file

Expand source code
def convert(self):
    """Convert the file when its name ends with ``.vtt``; otherwise do nothing"""
    # Test the suffix, not a substring: ".vtt" inside a directory name or
    # in the middle of a file name must not trigger a conversion.
    if self.pathname.endswith(".vtt"):
        self.vtt_to_str.process(self.pathname, self.encoding_format)
class VttToStr

Convert vtt to srt

Expand source code
class VttToStr:
    """Convert vtt subtitle text and files to srt

    The text-level methods take the whole file contents as a single string and
    return a new string; ``process`` combines them with file reading/writing.
    """

    def __init__(self) -> None:
        pass

    def convert_header(self, contents):
        """Remove the vtt header lines from the contents

        Deletes every ``WEBVTT``, ``Kind: ...`` and ``Language: ...`` line.

        :contents -- contents of vtt file
        :returns -- contents without those lines
        """
        replacement = re.sub(r"WEBVTT\n", "", contents)
        replacement = re.sub(r"Kind:[ \-\w]+\n", "", replacement)
        replacement = re.sub(r"Language:[ \-\w]+\n", "", replacement)
        return replacement

    def add_padding_to_timestamp(self, contents):
        """Pad short comma-style timestamp lines to the full HH:MM:SS,mmm form

        ``MM:SS,mmm`` ranges get a ``00:`` prefix and ``SS,mmm`` ranges get a
        ``00:00:`` prefix. Trailing ``name:value`` settings on the timestamp
        line are matched by the pattern and therefore dropped.

        :contents -- contents whose timestamps already use a comma
        :returns -- contents with padded timestamp lines
        """
        find_srt = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
        minute = r"((?:\d\d:){1}\d\d)"
        second = r"((?:\d\d:){0}\d\d)"
        padding_minute = find_srt.substitute(a=minute, b=r"(\d{0,3})")
        padding_second = find_srt.substitute(a=second, b=r"(\d{0,3})")
        # Minutes first: afterwards those lines are full-length and can no
        # longer be matched by the seconds-only pattern.
        replacement = re.sub(
            padding_minute, r"00:\1,\2 --> 00:\3,\4\n", contents)
        return re.sub(padding_second, r"00:00:\1,\2 --> 00:00:\3,\4\n", replacement)

    def convert_timestamp(self, contents):
        """Convert timestamp lines of a vtt file to srt format

        Rewrites the separator before the milliseconds to a comma, drops any
        trailing ``name:value`` settings, then pads short timestamps.

        :contents -- contents of vtt file
        :returns -- contents with srt-style timestamp lines
        """
        # NOTE: the separator is an unescaped "." in the pattern, so any single
        # character is accepted between the clock and the milliseconds.
        find_vtt = Template(r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
        all_timestamp = find_vtt.substitute(
            a=r"((?:\d\d:){0,2}\d\d)", b=r"(\d{0,3})")
        return self.add_padding_to_timestamp(re.sub(all_timestamp, r"\1,\2 --> \3,\4\n", contents))

    def convert_content(self, contents):
        """Convert content of vtt file to srt format

        Converts timestamps, removes the header, strips ``<c...>``/``</c>``
        tags, inline ``<HH:MM:SS.mmm>`` tags, ``::name(arg) {...}`` blocks and
        the ``Style:``/``##`` marker, then numbers the cues.

        :contents -- contents of vtt file
        :returns -- srt text
        """
        replacement = self.convert_timestamp(contents)
        replacement = self.convert_header(replacement)
        replacement = re.sub(r"<c[.\w\d]*>", "", replacement)
        replacement = re.sub(r"</c>", "", replacement)
        replacement = re.sub(r"<\d\d:\d\d:\d\d.\d\d\d>", "", replacement)
        replacement = re.sub(
            r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n", "", replacement)
        replacement = re.sub(r"Style:\n##\n", "", replacement)
        replacement = self.add_sequence_numbers(replacement)

        return replacement

    def has_timestamp(self, content):
        """Check if a line starts with a full srt timestamp range

        :content -- a single line of text
        :returns -- True when the line begins with ``HH:MM:SS,mmm --> HH:MM:SS,mmm``
        """
        return re.match(r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})", content) is not None

    def add_sequence_numbers(self, contents):
        """Adds sequence numbers to subtitle contents and returns new subtitle contents

        A counter line (starting at 1) is inserted before every timestamp line.
        Every input line is written back followed by a newline.

        :contents -- subtitle contents with srt timestamps
        :returns -- numbered subtitle contents
        """
        numbered = []
        i = 1
        for line in contents.split('\n'):
            if self.has_timestamp(line):
                numbered.append(str(i))
                i += 1
            numbered.append(line)
        # Same text the line-by-line concatenation produced, built in one pass.
        return '\n'.join(numbered) + '\n'

    def write_file(self, filename: str, data, encoding_format: str = "utf-8"):
        """Create a file with some data

        If the file cannot be opened at ``filename``, the write is retried in
        the current working directory using only the last path component.

        :filename -- filename path
        :data -- data to write (converted with ``str``)
        :encoding_format -- encoding format
        """
        try:
            with open(filename, "w", encoding=encoding_format) as file:
                file.write(str(data))
        except IOError:
            # Best effort: fall back to the bare file name in the current directory.
            filename = filename.split(os.sep)[-1]
            with open(filename, "w", encoding=encoding_format) as file:
                file.write(str(data))
        print(f"file created {filename}\n")

    def read_file(self, filename: str, encoding_format: str = "utf-8"):
        """Read a text file

        :filename -- filename path
        :encoding_format -- encoding format
        :returns -- the whole file as a string
        """
        content: str = ''
        with open(filename, mode="r", encoding=encoding_format) as file:
            print(f"file being read: {filename}\n")
            content = file.read()

        return content

    def process(self, filename: str, encoding_format: str = "utf-8"):
        """Convert vtt file to a srt file

        The output is written next to the input: a trailing ``.vtt`` extension
        (any letter case) is replaced by ``.srt``; for any other name ``.srt``
        is appended so the input file is never overwritten.

        :filename -- filename path
        :encoding_format -- encoding format
        """
        file_contents: str = self.read_file(filename, encoding_format)
        str_data: str = self.convert_content(file_contents)
        # Only touch the final extension; ".vtt" elsewhere in the path
        # (for example in a directory name) must stay as it is.
        root, extension = os.path.splitext(filename)
        if extension.lower() == ".vtt":
            filename = root + ".srt"
        else:
            filename = filename + ".srt"
        self.write_file(filename, str_data, encoding_format)

Methods

def add_padding_to_timestamp(self, contents)

Pad short timestamps with leading 00: fields to match the srt format

:contents – contents of vtt file

Expand source code
def add_padding_to_timestamp(self, contents):
    """Pad short comma-style timestamp lines to the full HH:MM:SS,mmm form

    ``MM:SS,mmm`` ranges receive a ``00:`` prefix and ``SS,mmm`` ranges a
    ``00:00:`` prefix; trailing ``name:value`` settings on the line are dropped.

    :contents -- contents whose timestamps already use a comma
    """
    line_shape = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
    fraction = r"(\d{0,3})"
    # (clock pattern, rewritten line); minutes are handled before seconds
    passes = (
        (r"((?:\d\d:){1}\d\d)", r"00:\1,\2 --> 00:\3,\4\n"),
        (r"((?:\d\d:){0}\d\d)", r"00:00:\1,\2 --> 00:00:\3,\4\n"),
    )
    padded = contents
    for clock, rewritten in passes:
        pattern = line_shape.substitute(a=clock, b=fraction)
        padded = re.sub(pattern, rewritten, padded)
    return padded
def add_sequence_numbers(self, contents)

Adds sequence numbers to subtitle contents and returns new subtitle contents

:contents – contents of vtt file

Expand source code
def add_sequence_numbers(self, contents):
    """Insert a running cue number before every timestamp line

    Numbering starts at 1. Every input line is emitted followed by a newline.

    :contents -- subtitle contents with srt timestamps
    """
    pieces = []
    counter = 1
    for row in contents.split('\n'):
        if self.has_timestamp(row):
            pieces.append(str(counter))
            counter += 1
        pieces.append(row)
    return '\n'.join(pieces) + '\n'
def convert_content(self, contents)

Convert content of vtt file to srt format

:contents – contents of vtt file

Expand source code
def convert_content(self, contents):
    """Turn the full text of a vtt file into srt text

    Timestamps are converted first, then the header is removed, vtt-only
    markup is stripped and finally the cues are numbered.

    :contents -- contents of vtt file
    """
    text = self.convert_header(self.convert_timestamp(contents))
    # Patterns removed in this order: <c...> tags, </c>, inline <HH:MM:SS.mmm>
    # tags, "::name(arg) {...}" blocks and the "Style:" / "##" marker.
    markup = (
        r"<c[.\w\d]*>",
        r"</c>",
        r"<\d\d:\d\d:\d\d.\d\d\d>",
        r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n",
        r"Style:\n##\n",
    )
    for pattern in markup:
        text = re.sub(pattern, "", text)
    return self.add_sequence_numbers(text)
def convert_header(self, contents)

Remove the vtt header lines, which srt does not use

:contents – contents of vtt file

Expand source code
def convert_header(self, contents):
    """Strip the vtt header lines from the contents

    Removes every ``WEBVTT``, ``Kind: ...`` and ``Language: ...`` line.

    :contents -- contents of vtt file
    """
    header_patterns = (
        r"WEBVTT\n",
        r"Kind:[ \-\w]+\n",
        r"Language:[ \-\w]+\n",
    )
    stripped = contents
    for pattern in header_patterns:
        stripped = re.sub(pattern, "", stripped)
    return stripped
def convert_timestamp(self, contents)

Convert timestamp of vtt file to srt format

:contents – contents of vtt file

Expand source code
def convert_timestamp(self, contents):
    """Rewrite vtt timestamp lines in srt form

    The separator before the milliseconds becomes a comma and trailing
    ``name:value`` settings are dropped; short timestamps are then padded.

    :contents -- contents of vtt file
    """
    clock = r"((?:\d\d:){0,2}\d\d)"
    fraction = r"(\d{0,3})"
    line_shape = Template(r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
    vtt_line = line_shape.substitute(a=clock, b=fraction)
    with_commas = re.sub(vtt_line, r"\1,\2 --> \3,\4\n", contents)
    return self.add_padding_to_timestamp(with_commas)
def has_timestamp(self, content)

Check whether a line is a timestamp line in srt format

:contents – contents of vtt file

Expand source code
def has_timestamp(self, content):
    """Tell whether a line starts with a full srt timestamp range

    :content -- a single line of text
    """
    srt_range = r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})"
    # re.match returns a (truthy) match object or None
    return bool(re.match(srt_range, content))
def process(self, filename: str, encoding_format: str = 'utf-8')

Convert vtt file to a srt file

:filename – filename path :encoding_format – encoding format

Expand source code
def process(self, filename: str, encoding_format: str = "utf-8"):
    """Convert vtt file to a srt file

    The output is written next to the input: a trailing ``.vtt`` extension
    (any letter case) is replaced by ``.srt``; for any other name ``.srt``
    is appended so the input file is never overwritten.

    :filename -- filename path
    :encoding_format -- encoding format
    """
    file_contents: str = self.read_file(filename, encoding_format)
    str_data: str = self.convert_content(file_contents)
    # Only touch the final extension; ".vtt" elsewhere in the path
    # (for example in a directory name) must stay as it is.
    root, extension = os.path.splitext(filename)
    if extension.lower() == ".vtt":
        filename = root + ".srt"
    else:
        filename = filename + ".srt"
    self.write_file(filename, str_data, encoding_format)
def read_file(self, filename: str, encoding_format: str = 'utf-8')

Read a file text

:filename – filename path :encoding_format – encoding format

Expand source code
def read_file(self, filename: str, encoding_format: str = "utf-8"):
    """Return the full text of a file

    A progress message is printed once the file has been opened.

    :filename -- filename path
    :encoding_format -- encoding format
    """
    with open(filename, mode="r", encoding=encoding_format) as handle:
        print(f"file being read: {filename}\n")
        return handle.read()
def write_file(self, filename: str, data, encoding_format: str = 'utf-8')

Create a file with some data

:filename – filename path :data – data to write :encoding_format – encoding format

Expand source code
def write_file(self, filename: str, data, encoding_format: str = "utf-8"):
    """Write ``str(data)`` to a file and report the path used

    When the file cannot be opened at ``filename``, the write is retried in
    the current working directory under the last path component.

    :filename -- filename path
    :data -- data to write
    :encoding_format -- encoding format
    """
    text = str(data)

    def dump(target):
        with open(target, "w", encoding=encoding_format) as handle:
            handle.write(text)

    try:
        dump(filename)
    except IOError:
        # Keep only the part after the last path separator and try again
        filename = filename.split(os.sep)[-1]
        dump(filename)
    print(f"file created {filename}\n")