srt-3.5.3/0000775000175000017500000000000014410451013013637 5ustar alexlembckealexlembckesrt-3.5.3/tox.ini0000664000175000017500000000234014410451013015151 0ustar alexlembckealexlembcke[tox] envlist = python [testenv] deps = -rtests/requirements.txt commands = {basepython} --version pytest -vv -n auto allowlist_externals = {basepython} pytest setenv= release: HYPOTHESIS_PROFILE=release [testenv:doctest] deps = {[testenv]deps} commands = pytest --doctest-modules [testenv:coverage] passenv = TRAVIS TRAVIS_JOB_ID TRAVIS_BRANCH deps = {[testenv]deps} coverage commands = coverage erase pytest -vv --cov=srt --cov-branch --cov-fail-under=100 --cov-report term-missing [testenv:pylint] skipsdist = True deps = {[testenv]deps} pylint commands = # C0330: https://github.com/psf/black/issues/1178 # R0913: These are intentional design decisions, so leave them. # R0205, R1725, C0209: We still support py2. pylint --disable=C0330,R0913,R0205,R1725,C0209 srt.py [testenv:black] skipsdist = True allowlist_externals = sh deps = black commands = black --check . sh -c 'exec black --check srt_tools/srt*' [testenv:pytype] skipsdist = True deps = {[testenv]deps} pytype commands = pytype . [testenv:bandit] skipsdist = True deps = {[testenv]deps} bandit commands = bandit srt.py [testenv:pypy3] basepython = pypy3 srt-3.5.3/.github/0000775000175000017500000000000014410451013015177 5ustar alexlembckealexlembckesrt-3.5.3/.github/workflows/0000775000175000017500000000000014410451013017234 5ustar alexlembckealexlembckesrt-3.5.3/.github/workflows/ci.yml0000664000175000017500000000175114410451013020356 0ustar alexlembckealexlembckejobs: build_and_test: name: CI strategy: matrix: # Pin to 20.04 for 3.6: https://github.com/actions/setup-python/issues/544 os: [ubuntu-20.04, macos-latest, windows-latest] python-version: ['2.7', '3.5', '3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - run: python --version - run: pip install -U pip - run: pip install -U tox - if: matrix.python-version == '3.9' && startsWith(matrix.os, 'ubuntu-') run: | echo "TOXENV=doctest,black,pylint,pytype,bandit,coverage" >> "$GITHUB_ENV" - run: tox env: TOXENV: ${{ env.TOXENV }} - if: matrix.python-version == '3.9' && startsWith(matrix.os, 'ubuntu-') uses: AndreMiras/coveralls-python-action@develop on: push: pull_request: workflow_dispatch: srt-3.5.3/MANIFEST.in0000664000175000017500000000025014410451013015372 0ustar alexlembckealexlembckeinclude LICENSE include MANIFEST.in include README.rst recursive-include docs * recursive-include tests * recursive-exclude * *.py[co] recursive-exclude * __pycache__ srt-3.5.3/srt_tools/0000775000175000017500000000000014410451013015667 5ustar alexlembckealexlembckesrt-3.5.3/srt_tools/srt-normalise0000775000175000017500000000142614410451013020417 0ustar alexlembckealexlembcke#!/usr/bin/env python """Takes a badly formatted SRT file and outputs a strictly valid one.""" import srt_tools.utils import logging log = logging.getLogger(__name__) def main(): examples = {"Normalise a subtitle": "srt normalise -i bad.srt -o good.srt"} args = srt_tools.utils.basic_parser( description=__doc__, examples=examples, hide_no_strict=True ).parse_args() logging.basicConfig(level=args.log_level) srt_tools.utils.set_basic_args(args) output = srt_tools.utils.compose_suggest_on_fail(args.input, strict=args.strict) try: args.output.write(output) except (UnicodeEncodeError, TypeError): # Python 2 fallback 
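        # Writing unicode to a byte-oriented stream can fail on Python 2, so
        # fall back to encoding explicitly with the chosen -e encoding.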
args.output.write(output.encode(args.encoding)) if __name__ == "__main__": # pragma: no cover main() srt-3.5.3/srt_tools/srt-play0000775000175000017500000000300714410451013017370 0ustar alexlembckealexlembcke#!/usr/bin/env python """Play subtitles with correct timing to stdout.""" from __future__ import print_function import logging from threading import Timer, Lock import srt_tools.utils import sys import time log = logging.getLogger(__name__) output_lock = Lock() def print_sub(sub, encoding): log.debug("Timer woke up to print %s", sub.content) with output_lock: try: sys.stdout.write(sub.content + "\n\n") except UnicodeEncodeError: # Python 2 fallback sys.stdout.write(sub.content.encode(encoding) + "\n\n") sys.stdout.flush() def schedule(subs, encoding): timers = set() log.debug("Scheduling subtitles") for sub in subs: secs = sub.start.total_seconds() cur_timer = Timer(secs, print_sub, [sub, encoding]) cur_timer.name = "%s:%s" % (sub.index, secs) cur_timer.daemon = True log.debug('Adding "%s" to schedule queue', cur_timer.name) timers.add(cur_timer) for timer in timers: log.debug('Starting timer for "%s"', timer.name) timer.start() while any(t.is_alive() for t in timers): time.sleep(0.5) def main(): examples = {"Play a subtitle": "srt play -i foo.srt"} args = srt_tools.utils.basic_parser( description=__doc__, examples=examples, no_output=True ).parse_args() logging.basicConfig(level=args.log_level) srt_tools.utils.set_basic_args(args) schedule(args.input, args.encoding) if __name__ == "__main__": # pragma: no cover main() srt-3.5.3/srt_tools/srt-fixed-timeshift0000775000175000017500000000251214410451013021514 0ustar alexlembckealexlembcke#!/usr/bin/env python """Shifts a subtitle by a fixed number of seconds.""" import datetime import srt_tools.utils import logging log = logging.getLogger(__name__) def parse_args(): examples = { "Make all subtitles 5 seconds later": "srt fixed-timeshift --seconds 5", "Make all subtitles 5 seconds earlier": "srt fixed-timeshift --seconds -5", } parser = srt_tools.utils.basic_parser(description=__doc__, examples=examples) parser.add_argument( "--seconds", type=float, required=True, help="how many seconds to shift" ) return parser.parse_args() def scalar_correct_subs(subtitles, seconds_to_shift): td_to_shift = datetime.timedelta(seconds=seconds_to_shift) for subtitle in subtitles: subtitle.start += td_to_shift subtitle.end += td_to_shift yield subtitle def main(): args = parse_args() logging.basicConfig(level=args.log_level) srt_tools.utils.set_basic_args(args) corrected_subs = scalar_correct_subs(args.input, args.seconds) output = srt_tools.utils.compose_suggest_on_fail(corrected_subs, strict=args.strict) try: args.output.write(output) except (UnicodeEncodeError, TypeError): # Python 2 fallback args.output.write(output.encode(args.encoding)) if __name__ == "__main__": # pragma: no cover main() srt-3.5.3/srt_tools/srt-mux0000775000175000017500000000667414410451013017251 0ustar alexlembckealexlembcke#!/usr/bin/env python """Merge multiple subtitles together into one.""" import datetime import srt_tools.utils import logging import operator log = logging.getLogger(__name__) TOP = r"{\an8}" BOTTOM = r"{\an2}" def parse_args(): examples = { "Merge English and Chinese subtitles": "srt mux -i eng.srt -i chs.srt -o both.srt", "Merge subtitles, with one on top and one at the bottom": "srt mux -t -i eng.srt -i chs.srt -o both.srt", } parser = srt_tools.utils.basic_parser( description=__doc__, examples=examples, multi_input=True ) parser.add_argument( "--ms", 
        metavar="MILLISECONDS",
        default=datetime.timedelta(milliseconds=600),
        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
        help="if subs being muxed are within this number of milliseconds "
        "of each other, they will have their times matched (default: 600)",
    )
    parser.add_argument(
        "-w",
        "--width",
        default=5,
        type=int,
        help="how many subs to consider for time matching at once (default: %(default)s)",
    )
    parser.add_argument(
        "-t",
        "--top-and-bottom",
        action="store_true",
        help="use SSA-style tags to place files at the top and bottom, respectively. Turns off time matching",
    )
    parser.add_argument(
        "--no-time-matching",
        action="store_true",
        help="don't try to do time matching for close subtitles (see --ms)",
    )
    return parser.parse_args()


def merge_subs(subs, acceptable_diff, attr, width):
    """
    Merge subs with similar start/end times together. This prevents the
    subtitles jumping around the screen.

    The merge is done in-place.
    """
    sorted_subs = sorted(subs, key=operator.attrgetter(attr))

    for subs in srt_tools.utils.sliding_window(sorted_subs, width=width):
        current_sub = subs[0]
        future_subs = subs[1:]

        current_comp = getattr(current_sub, attr)
        for future_sub in future_subs:
            future_comp = getattr(future_sub, attr)
            if current_comp + acceptable_diff > future_comp:
                log.debug(
                    "Merging %d's %s time into %d",
                    future_sub.index,
                    attr,
                    current_sub.index,
                )
                setattr(future_sub, attr, current_comp)
            else:
                # Since these are sorted, and this one didn't match, we can be
                # sure future ones won't match either.
                break


def main():
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    muxed_subs = []
    for idx, subs in enumerate(args.input):
        for sub in subs:
            if args.top_and_bottom:
                if idx % 2 == 0:
                    sub.content = TOP + sub.content
                else:
                    sub.content = BOTTOM + sub.content
            muxed_subs.append(sub)

    # Time matching is skipped both when --no-time-matching is passed and when
    # -t/--top-and-bottom is passed, as documented in the help text above.
    if not args.no_time_matching and not args.top_and_bottom:
        merge_subs(muxed_subs, args.ms, "start", args.width)
        merge_subs(muxed_subs, args.ms, "end", args.width)

    output = srt_tools.utils.compose_suggest_on_fail(muxed_subs, strict=args.strict)

    try:
        args.output.write(output)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(output.encode(args.encoding))


if __name__ == "__main__":  # pragma: no cover
    main()
srt-3.5.3/srt_tools/srt-process0000775000175000017500000000313414410451013020102 0ustar alexlembckealexlembcke#!/usr/bin/env python
"""Process subtitle text content using arbitrary Python code."""

import importlib
import srt_tools.utils
import logging

log = logging.getLogger(__name__)


def strip_to_matching_lines_only(subtitles, imports, func_str):
    for import_name in imports:
        real_import = importlib.import_module(import_name)
        globals()[import_name] = real_import

    func = eval(func_str)  # pylint: disable-msg=eval-used

    for subtitle in subtitles:
        subtitle.content = func(subtitle.content)
        yield subtitle


def parse_args():
    examples = {
        "Strip HTML-like symbols from a subtitle": """srt process -m re -f 'lambda sub: re.sub("<[^<]+?>", "", sub)'"""
    }

    parser = srt_tools.utils.basic_parser(description=__doc__, examples=examples)
    parser.add_argument(
        "-f", "--func", help="a function to use to process lines", required=True
    )
    parser.add_argument(
        "-m",
        "--module",
        help="modules to import in the function context",
        action="append",
        default=[],
    )
    return parser.parse_args()


def main():
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)
    processed_subs = strip_to_matching_lines_only(args.input, args.module, args.func)
    output = 
srt_tools.utils.compose_suggest_on_fail(processed_subs, strict=args.strict) try: args.output.write(output) except (UnicodeEncodeError, TypeError): # Python 2 fallback args.output.write(output.encode(args.encoding)) if __name__ == "__main__": # pragma: no cover main() srt-3.5.3/srt_tools/__init__.py0000775000175000017500000000000014410451013017771 0ustar alexlembckealexlembckesrt-3.5.3/srt_tools/srt-lines-matching0000775000175000017500000000462214410451013021331 0ustar alexlembckealexlembcke#!/usr/bin/env python """Filter subtitles that match or don't match a particular pattern.""" import importlib import srt_tools.utils import logging log = logging.getLogger(__name__) def strip_to_matching_lines_only(subtitles, imports, func_str, invert, per_sub): for import_name in imports: real_import = importlib.import_module(import_name) globals()[import_name] = real_import raw_func = eval(func_str) # pylint: disable-msg=eval-used if invert: func = lambda line: not raw_func(line) else: func = raw_func for subtitle in subtitles: if per_sub: if not func(subtitle.content): subtitle.content = "" else: subtitle.content = "\n".join( line for line in subtitle.content.splitlines() if func(line) ) yield subtitle def parse_args(): examples = { "Only include Chinese lines": "srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese", "Exclude all lines which only contain numbers": "srt lines-matching -v -f 'lambda x: x.isdigit()'", } parser = srt_tools.utils.basic_parser(description=__doc__, examples=examples) parser.add_argument( "-f", "--func", help="a function to use to match lines", required=True ) parser.add_argument( "-m", "--module", help="modules to import in the function context", action="append", default=[], ) parser.add_argument( "-s", "--per-subtitle", help="match the content of each subtitle, not each line", action="store_true", ) parser.add_argument( "-v", "--invert", help="invert matching -- only match lines returning False", action="store_true", ) return parser.parse_args() def main(): args = parse_args() logging.basicConfig(level=args.log_level) srt_tools.utils.set_basic_args(args) matching_subtitles_only = strip_to_matching_lines_only( args.input, args.module, args.func, args.invert, args.per_subtitle ) output = srt_tools.utils.compose_suggest_on_fail( matching_subtitles_only, strict=args.strict ) try: args.output.write(output) except (UnicodeEncodeError, TypeError): # Python 2 fallback args.output.write(output.encode(args.encoding)) if __name__ == "__main__": # pragma: no cover main() srt-3.5.3/srt_tools/srt0000775000175000017500000000245514410451013016433 0ustar alexlembckealexlembcke#!/usr/bin/env python import os import sys import errno SRT_BIN_PREFIX = "srt-" def find_srt_commands_in_path(): paths = os.environ.get("PATH", "").split(os.pathsep) for path in paths: try: path_files = os.listdir(path) except OSError as thrown_exc: if thrown_exc.errno in (errno.ENOENT, errno.ENOTDIR): continue else: raise for path_file in path_files: if path_file.startswith(SRT_BIN_PREFIX): yield path_file[len(SRT_BIN_PREFIX) :] def show_help(): print( "Available commands " "(pass --help to a specific command for usage information):\n" ) commands = sorted(set(find_srt_commands_in_path())) for command in commands: print("- {}".format(command)) def main(): if len(sys.argv) < 2 or sys.argv[1].startswith("-"): show_help() sys.exit(0) command = sys.argv[1] available_commands = find_srt_commands_in_path() if command not in available_commands: print('Unknown command: "{}"\n'.format(command)) show_help() 
sys.exit(1) real_command = SRT_BIN_PREFIX + command os.execvp(real_command, [real_command] + sys.argv[2:]) if __name__ == "__main__": # pragma: no cover main() srt-3.5.3/srt_tools/tests/0000775000175000017500000000000014410451013017031 5ustar alexlembckealexlembckesrt-3.5.3/srt_tools/tests/files/0000775000175000017500000000000014410451013020133 5ustar alexlembckealexlembckesrt-3.5.3/srt_tools/tests/files/gb2312.srt0000664000175000017500000000026614410451013021571 0ustar alexlembckealexlembcke2 00:00:27,000 --> 00:00:30,730 宇宙守护神 保护我们远离邪恶 4 00:00:31,500 --> 00:00:34,100 他能徒手抓住子弹 拦住疾速的火车 6 00:00:34,100 --> 00:00:36,570 轻轻一跳就能飞跃高楼 srt-3.5.3/srt_tools/tests/files/ascii.srt0000664000175000017500000000020214410451013021747 0ustar alexlembckealexlembcke2 00:00:27,000 --> 00:00:30,730 ascii 4 00:00:31,500 --> 00:00:34,100 oh look 6 00:00:34,100 --> 00:00:36,570 ascii everywhere srt-3.5.3/srt_tools/tests/__init__.py0000775000175000017500000000000014410451013021133 0ustar alexlembckealexlembckesrt-3.5.3/srt_tools/tests/test_srt_tools.py0000664000175000017500000000663214410451013022501 0ustar alexlembckealexlembcke#!/usr/bin/env python import os import subprocess import sys import tempfile try: from shlex import quote except ImportError: # <3.3 fallback from pipes import quote sample_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "files") if os.name == "nt": # Sigh, shlex.quote quotes incorrectly on Windows quote = lambda x: windows_crappy_quote(x) def run_srt_util(cmd, shell=False, encoding="utf-8-sig"): extra_env = {} env = {"PYTHONPATH": ".", "SystemRoot": r"C:\Windows"} env.update(extra_env) raw_out = subprocess.check_output(cmd, shell=shell, env=env) return raw_out.decode(encoding) def windows_crappy_quote(data): """ I'm 100% sure this isn't secure, please don't use it with untrusted code. I beg you. """ data = data.replace('"', '""') return '"' + data + '"' def assert_supports_all_io_methods(cmd, exclude_output=False, exclude_stdin=False): # TODO: pytype doesn't like the mixed types in the matrix, but this works # fine. Maybe it would be happier with a namedtuple? 
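    # Run the tool straight from the source tree under the current
    # interpreter, rather than relying on an installed console script.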
cmd[0] = "srt_tools/" + cmd[0] # pytype: disable=unsupported-operands cmd.insert(0, sys.executable) # pytype: disable=attribute-error in_file = os.path.join(sample_dir, "ascii.srt") in_file_gb = os.path.join(sample_dir, "gb2312.srt") fd, out_file = tempfile.mkstemp() # This is accessed by filename, not fd os.close(fd) outputs = [] cmd_string = " ".join(quote(x) for x in cmd) try: outputs.append(run_srt_util(cmd + ["-i", in_file])) if not exclude_stdin: outputs.append( run_srt_util("%s < %s" % (cmd_string, quote(in_file)), shell=True) ) if not exclude_output: run_srt_util(cmd + ["-i", in_file, "-o", out_file]) run_srt_util( cmd + ["-i", in_file_gb, "-o", out_file, "-e", "gb2312"], encoding="gb2312", ) if not exclude_stdin: run_srt_util( "%s < %s > %s" % (cmd_string, quote(in_file), quote(out_file)), shell=True, ) run_srt_util( "%s < %s > %s" % (cmd_string + " -e gb2312", quote(in_file), quote(out_file)), shell=True, encoding="gb2312", ) assert len(set(outputs)) == 1, repr(outputs) if os.name == "nt": assert "\r\n" in outputs[0] else: assert "\r\n" not in outputs[0] finally: os.remove(out_file) def test_tools_support(): matrix = [ (["srt-normalise"], False), (["srt-deduplicate"], False), (["srt-fixed-timeshift", "--seconds", "5"], False), ( [ "srt-linear-timeshift", "--f1", "00:00:01,000", "--f2", "00:00:02,000", "--t1", "00:00:03,000", "--t2", "00:00:04,000", ], False, ), (["srt-lines-matching", "-f", "lambda x: True"], False), (["srt-process", "-f", "lambda x: x"], False), (["srt-mux"], False, True), (["srt-mux", "-t"], False, True), # Need to sort out time/thread issues # (('srt-play'), True), ] for args in matrix: assert_supports_all_io_methods(*args) srt-3.5.3/srt_tools/srt-deduplicate0000775000175000017500000000556614410451013020722 0ustar alexlembckealexlembcke#!/usr/bin/env python """Deduplicate repeated subtitles.""" import datetime import srt_tools.utils import logging import operator log = logging.getLogger(__name__) try: # Python 2 range = xrange # pytype: disable=name-error except NameError: pass def parse_args(): examples = { "Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt", "Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt", "Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt", } parser = srt_tools.utils.basic_parser( description=__doc__, examples=examples, ) parser.add_argument( "-t", "--ms", metavar="MILLISECONDS", default=datetime.timedelta(milliseconds=5000), type=lambda ms: datetime.timedelta(milliseconds=int(ms)), help="how many milliseconds distance a subtitle start time must be " "within of another to be considered a duplicate " "(default: 5000ms)", ) return parser.parse_args() def deduplicate_subs(orig_subs, acceptable_diff): """Remove subtitles with duplicated content.""" indices_to_remove = [] # If we only store the subtitle itself and compare that, it's possible that # we'll not only remove the duplicate, but also the _original_ subtitle if # they have the same sub index/times/etc. # # As such, we need to also store the index in the original subs list that # this entry belongs to for each subtitle prior to sorting. 
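    # e.g. enumerate() yields (position, subtitle) pairs, so after sorting by
    # (content, start) we can still delete a duplicate by its position in
    # orig_subs rather than by equality.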
sorted_subs = sorted( enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start) ) for subs in srt_tools.utils.sliding_window(sorted_subs, width=2, inclusive=False): cur_idx, cur_sub = subs[0] next_idx, next_sub = subs[1] if cur_sub.content == next_sub.content and ( not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start ): log.debug( "Marking l%d/s%d for removal, duplicate of l%d/s%d", next_idx, next_sub.index, cur_idx, cur_sub.index, ) indices_to_remove.append(next_idx) offset = 0 for idx in indices_to_remove: del orig_subs[idx - offset] offset += 1 def main(): args = parse_args() logging.basicConfig(level=args.log_level) srt_tools.utils.set_basic_args(args) subs = list(args.input) deduplicate_subs(subs, args.ms) output = srt_tools.utils.compose_suggest_on_fail(subs, strict=args.strict) try: args.output.write(output) except (UnicodeEncodeError, TypeError): # Python 2 fallback args.output.write(output.encode(args.encoding)) if __name__ == "__main__": # pragma: no cover main() srt-3.5.3/srt_tools/srt-linear-timeshift0000775000175000017500000000620614410451013021673 0ustar alexlembckealexlembcke#!/usr/bin/env python """Perform linear time correction on a subtitle.""" from __future__ import division import srt import datetime import srt_tools.utils import logging log = logging.getLogger(__name__) def timedelta_to_milliseconds(delta): return delta.days * 86400000 + delta.seconds * 1000 + delta.microseconds / 1000 def parse_args(): def srt_timestamp_to_milliseconds(parser, arg): try: delta = srt.srt_timestamp_to_timedelta(arg) except ValueError: parser.error("not a valid SRT timestamp: %s" % arg) else: return timedelta_to_milliseconds(delta) examples = { "Stretch out a subtitle so that second 1 is 1, 2 is 3, 3 is 5, etc": "srt linear-timeshift --f1 00:00:01,000 --t1 00:00:01,000 --f2 00:00:02,000 --t2 00:00:03,000" } parser = srt_tools.utils.basic_parser(description=__doc__, examples=examples) parser.add_argument( "--from-start", "--f1", type=lambda arg: srt_timestamp_to_milliseconds(parser, arg), required=True, help="the first desynchronised timestamp", ) parser.add_argument( "--to-start", "--t1", type=lambda arg: srt_timestamp_to_milliseconds(parser, arg), required=True, help="the first synchronised timestamp", ) parser.add_argument( "--from-end", "--f2", type=lambda arg: srt_timestamp_to_milliseconds(parser, arg), required=True, help="the second desynchronised timestamp", ) parser.add_argument( "--to-end", "--t2", type=lambda arg: srt_timestamp_to_milliseconds(parser, arg), required=True, help="the second synchronised timestamp", ) return parser.parse_args() def calc_correction(to_start, to_end, from_start, from_end): angular = (to_end - to_start) / (from_end - from_start) linear = to_end - angular * from_end return angular, linear def correct_time(current_msecs, angular, linear): return round(current_msecs * angular + linear) def correct_timedelta(bad_delta, angular, linear): bad_msecs = timedelta_to_milliseconds(bad_delta) good_msecs = correct_time(bad_msecs, angular, linear) good_delta = datetime.timedelta(milliseconds=good_msecs) return good_delta def linear_correct_subs(subtitles, angular, linear): for subtitle in subtitles: subtitle.start = correct_timedelta(subtitle.start, angular, linear) subtitle.end = correct_timedelta(subtitle.end, angular, linear) yield subtitle def main(): args = parse_args() logging.basicConfig(level=args.log_level) angular, linear = calc_correction( args.to_start, args.to_end, args.from_start, args.from_end ) 
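    # Worked example using the sample invocation from parse_args() above
    # (--f1 00:00:01,000 --t1 00:00:01,000 --f2 00:00:02,000 --t2 00:00:03,000):
    # angular = (3000 - 1000) / (2000 - 1000) = 2.0 and
    # linear = 3000 - 2.0 * 2000 = -1000.0, so a timestamp at 00:00:01,500
    # becomes round(1500 * 2.0 - 1000.0) = 2000 ms, i.e. 00:00:02,000.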
srt_tools.utils.set_basic_args(args) corrected_subs = linear_correct_subs(args.input, angular, linear) output = srt_tools.utils.compose_suggest_on_fail(corrected_subs, strict=args.strict) try: args.output.write(output) except (UnicodeEncodeError, TypeError): # Python 2 fallback args.output.write(output.encode(args.encoding)) if __name__ == "__main__": # pragma: no cover main() srt-3.5.3/srt_tools/utils.py0000775000175000017500000001642214410451013017411 0ustar alexlembckealexlembcke#!/usr/bin/env python import argparse import codecs import srt import logging import sys import itertools import os try: from collections.abc import MutableSequence except ImportError: from collections import MutableSequence PROG_NAME = os.path.basename(sys.argv[0]).replace("-", " ", 1) STDIN_BYTESTREAM = getattr(sys.stdin, "buffer", sys.stdin) STDOUT_BYTESTREAM = getattr(sys.stdout, "buffer", sys.stdout) DASH_STREAM_MAP = {"input": STDIN_BYTESTREAM, "output": STDOUT_BYTESTREAM} try: # Python 2 range = xrange # pytype: disable=name-error except NameError: pass log = logging.getLogger(__name__) def noop(stream): """ Used when we didn't explicitly specify a stream to avoid using codecs.get{reader,writer} """ return stream def dash_to_stream(arg, arg_type): if arg == "-": return DASH_STREAM_MAP[arg_type] return arg def basic_parser( description=None, multi_input=False, no_output=False, examples=None, hide_no_strict=False, ): example_lines = [] if examples is not None: example_lines.append("examples:") for desc, code in examples.items(): example_lines.append(" {}".format(desc)) example_lines.append(" $ {}\n".format(code)) parser = argparse.ArgumentParser( prog=PROG_NAME, description=description, epilog="\n".join(example_lines), formatter_class=argparse.RawDescriptionHelpFormatter, ) # Cannot use argparse.FileType as we need to know the encoding from the # args if multi_input: parser.add_argument( "--input", "-i", metavar="FILE", action="append", type=lambda arg: dash_to_stream(arg, "input"), help="the files to process", required=True, ) else: parser.add_argument( "--input", "-i", metavar="FILE", default=STDIN_BYTESTREAM, type=lambda arg: dash_to_stream(arg, "input"), help="the file to process (default: stdin)", ) if not no_output: parser.add_argument( "--output", "-o", metavar="FILE", default=STDOUT_BYTESTREAM, type=lambda arg: dash_to_stream(arg, "output"), help="the file to write to (default: stdout)", ) if not multi_input: parser.add_argument( "--inplace", "-p", action="store_true", help="modify file in place", ) shelp = "allow blank lines in output, your media player may explode" if hide_no_strict: shelp = argparse.SUPPRESS parser.add_argument("--no-strict", action="store_false", dest="strict", help=shelp) parser.add_argument( "--debug", action="store_const", dest="log_level", const=logging.DEBUG, default=logging.INFO, help="enable debug logging", ) parser.add_argument( "--ignore-parsing-errors", "-c", action="store_true", help="try to keep going, even if there are parsing errors", ) parser.add_argument( "--encoding", "-e", help="the encoding to read/write files in (default: utf8)" ) return parser def set_basic_args(args): # TODO: dedupe some of this if getattr(args, "inplace", None): if args.input == DASH_STREAM_MAP["input"]: raise ValueError("Cannot use --inplace on stdin") if args.output != DASH_STREAM_MAP["output"]: raise ValueError("Cannot use -o and -p together") args.output = args.input for stream_name in ("input", "output"): log.debug('Processing stream "%s"', stream_name) try: stream = getattr(args, 
stream_name) except AttributeError: # For example, in the case of no_output continue # We don't use system default encoding, because usually one runs this # on files they got from elsewhere. As such, be opinionated that these # files are probably UTF-8. Looking for the BOM on reading allows us to # be more liberal with what we accept, without adding BOMs on write. read_encoding = args.encoding or "utf-8-sig" write_encoding = args.encoding or "utf-8" r_enc = codecs.getreader(read_encoding) w_enc = codecs.getwriter(write_encoding) log.debug("Got %r as stream", stream) # We don't use encoding= option to open because we want to have the # same universal newlines behaviour as STD{IN,OUT}_BYTESTREAM if stream in DASH_STREAM_MAP.values(): log.debug("%s in DASH_STREAM_MAP", stream_name) if stream is args.input: args.input = srt.parse( r_enc(args.input).read(), ignore_errors=args.ignore_parsing_errors ) elif stream is args.output: # Since args.output is not in text mode (since we didn't # earlier know the encoding), we have no universal newline # support and need to do it ourselves args.output = w_enc(args.output) else: log.debug("%s not in DASH_STREAM_MAP", stream_name) if stream is args.input: if isinstance(args.input, MutableSequence): for i, input_fn in enumerate(args.input): if input_fn in DASH_STREAM_MAP.values(): if stream is args.input: args.input[i] = srt.parse( r_enc(input_fn).read(), ignore_errors=args.ignore_parsing_errors, ) else: f = r_enc(open(input_fn, "rb")) with f: args.input[i] = srt.parse( f.read(), ignore_errors=args.ignore_parsing_errors ) else: f = r_enc(open(stream, "rb")) with f: args.input = srt.parse( f.read(), ignore_errors=args.ignore_parsing_errors ) else: args.output = w_enc(open(args.output, "wb")) def compose_suggest_on_fail(subs, strict=True): try: return srt.compose(subs, strict=strict, eol=os.linesep, in_place=True) except srt.SRTParseError as thrown_exc: # Since `subs` is actually a generator log.critical( "Parsing failed, maybe you need to pass a different encoding " "with --encoding?" ) raise def sliding_window(seq, width=2, inclusive=True): """ If inclusive is True, we also include final elements where len(sliced) < width. """ seq_iter = iter(seq) # Consume seq_iter up to width sliced = tuple(itertools.islice(seq_iter, width)) if not inclusive and len(sliced) != width: return yield sliced for elem in seq_iter: sliced = sliced[1:] + (elem,) yield sliced if inclusive: for idx in range(len(sliced)): if idx != 0: yield sliced[idx:] srt-3.5.3/srt_tools/README.rst0000664000175000017500000000563214410451013017364 0ustar alexlembckealexlembckesrt_tools contains utilities written to process SRT files. All utilities use the Python srt_ library internally. .. _srt: https://github.com/cdown/srt Usage ----- You can call ``srt`` directly to see a list of all available utilities. .. code:: srt [utility-name] [args ...] Arbitrary things can be done with *srt process* and *srt lines-matching*, for example: .. code:: # Strip HTML srt process -m re -f 'lambda sub: re.sub("<[^<]+?>", "", sub)' # Only keep Chinese subtitles srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese Utilities --------- - *deduplicate* removes subtitles with duplicate content. If you have subtitles which mistakenly repeat the same content in different subs at roughly the same time, you can run this tool to remove them. - *fixed-timeshift* does fixed time correction. 
  For example, if you have a movie that is consistently out of sync by two
  seconds, you can run this tool to shift the entire subtitle two seconds
  ahead or behind.
- *linear-timeshift* does linear time correction. If you have a movie that
  runs slower or faster than the subtitle that you have, it will repeatedly
  lose sync. This tool can apply linear time corrections to all subtitles in
  the SRT, resyncing it with the video.
- *lines-matching* takes a function and removes lines that don't return true
  when passed to it. For example, you can keep only lines that contain
  Chinese by installing the hanzidentifier_ package, and running
  ``srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese < input``.
- *mux* can mux_ multiple subtitles together into one. For example, if you
  have a Chinese subtitle and an English subtitle, and you want to have one
  subtitle file that contains both, this tool can do that for you. It also
  supports clamping subtitles starting or ending at similar times to the same
  time to avoid subtitles jumping around the screen.
- *normalise* standardises and cleans up SRT files. For example, it removes
  spurious newlines, normalises timestamps, and fixes subtitle indexing to a
  format that all media players should accept, with no noncompliant data.
- *play* plays subtitles in the terminal at the time they are scheduled to
  display (note: it does not clear them from the screen afterwards). If you
  need to fast-forward to some point, you can combine it with
  *fixed-timeshift*.
- *process* allows processing text freely. It takes a function, similarly to
  *lines-matching*, and changes SRT content into the return value. For
  example, you can naively strip some basic HTML-like markup with
  ``srt process -m re -f 'lambda sub: re.sub("<[^<]+?>", "", sub)'``.
  HTML-like syntax is especially prevalent in `SSA/ASS`_ subtitles that have
  been directly converted to SRT.

.. _mux: https://en.wikipedia.org/wiki/Multiplexing
.. _`SSA/ASS`: https://en.wikipedia.org/wiki/SubStation_Alpha
.. _hanzidentifier: https://github.com/tsroten/hanzidentifier
srt-3.5.3/setup.py0000664000175000017500000000325014410451013015351 0ustar alexlembckealexlembcke#!/usr/bin/env python

import codecs

from setuptools import setup

with codecs.open("README.rst", encoding="utf8") as readme_f:
    README = readme_f.read()

setup(
    name="srt",
    version="3.5.3",
    python_requires=">=2.7",
    description="A tiny library for parsing, modifying, and composing SRT files.",
    long_description=README,
    author="Chris Down",
    author_email="chris@chrisdown.name",
    url="https://github.com/cdown/srt",
    py_modules=["srt", "srt_tools.utils"],
    scripts=[
        "srt_tools/srt",
        "srt_tools/srt-deduplicate",
        "srt_tools/srt-normalise",
        "srt_tools/srt-fixed-timeshift",
        "srt_tools/srt-linear-timeshift",
        "srt_tools/srt-lines-matching",
        "srt_tools/srt-mux",
        "srt_tools/srt-play",
        "srt_tools/srt-process",
    ],
    license="MIT",
    keywords="srt",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 2",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Topic :: Multimedia :: Video",
        "Topic :: Software Development :: Libraries",
        "Topic :: Text Processing",
    ],
)
srt-3.5.3/srt.py0000775000175000017500000004424014410451013015030 0ustar alexlembckealexlembcke#!/usr/bin/env python
# coding=utf8

"""A tiny library for parsing, modifying, and composing SRT files."""

from __future__ import unicode_literals
import functools
import re
from datetime import timedelta
import logging
import io

LOG = logging.getLogger(__name__)

# "." is not technically valid as a delimiter, but many editors create SRT
# files with this delimiter for whatever reason. Many editors and players
# accept it, so we do too.
RGX_TIMESTAMP_MAGNITUDE_DELIM = r"[,.:，．。：]"
RGX_TIMESTAMP_FIELD = r"[0-9]+"
RGX_TIMESTAMP_FIELD_OPTIONAL = r"[0-9]*"
RGX_TIMESTAMP = "".join(
    [
        RGX_TIMESTAMP_MAGNITUDE_DELIM.join([RGX_TIMESTAMP_FIELD] * 3),
        RGX_TIMESTAMP_MAGNITUDE_DELIM,
        "?",
        RGX_TIMESTAMP_FIELD_OPTIONAL,
    ]
)
RGX_TIMESTAMP_PARSEABLE = r"^{}$".format(
    "".join(
        [
            RGX_TIMESTAMP_MAGNITUDE_DELIM.join(["(" + RGX_TIMESTAMP_FIELD + ")"] * 3),
            RGX_TIMESTAMP_MAGNITUDE_DELIM,
            "?",
            "(",
            RGX_TIMESTAMP_FIELD_OPTIONAL,
            ")",
        ]
    )
)
RGX_INDEX = r"-?[0-9]+\.?[0-9]*"
RGX_PROPRIETARY = r"[^\r\n]*"
RGX_CONTENT = r".*?"
RGX_POSSIBLE_CRLF = r"\r?\n"

TS_REGEX = re.compile(RGX_TIMESTAMP_PARSEABLE)
MULTI_WS_REGEX = re.compile(r"\n\n+")
SRT_REGEX = re.compile(
    r"\s*(?:({idx})\s*{eof})?({ts}) *-[ -] *> *({ts}) ?({proprietary})(?:{eof}|\Z)({content})"
    # Many sub editors don't add a blank line to the end, and many editors and
    # players accept that. We allow it to be missing in input.
    #
    # We also allow subs that are missing a double blank newline. This often
    # happens on subs which were first created as a mixed language subtitle,
    # for example chs/eng, and then were stripped using naive methods (such as
    # ed/sed) that don't understand newline preservation rules in SRT files.
# # This means that when you are, say, only keeping chs, and the line only # contains english, you end up with not only no content, but also all of # the content lines are stripped instead of retaining a newline. r"(?:{eof}|\Z)(?:{eof}|\Z|(?=(?:{idx}\s*{eof}{ts})))" # Some SRT blocks, while this is technically invalid, have blank lines # inside the subtitle content. We look ahead a little to check that the # next lines look like an index and a timestamp as a best-effort # solution to work around these. r"(?=(?:(?:{idx}\s*{eof})?{ts}|\Z))".format( idx=RGX_INDEX, ts=RGX_TIMESTAMP, proprietary=RGX_PROPRIETARY, content=RGX_CONTENT, eof=RGX_POSSIBLE_CRLF, ), re.DOTALL, ) ZERO_TIMEDELTA = timedelta(0) # Info message if truthy return -> Function taking a Subtitle, skip if True SUBTITLE_SKIP_CONDITIONS = ( ("No content", lambda sub: not sub.content.strip()), ("Start time < 0 seconds", lambda sub: sub.start < ZERO_TIMEDELTA), ("Subtitle start time >= end time", lambda sub: sub.start >= sub.end), ) SECONDS_IN_HOUR = 3600 SECONDS_IN_MINUTE = 60 HOURS_IN_DAY = 24 MICROSECONDS_IN_MILLISECOND = 1000 try: FILE_TYPES = (file, io.IOBase) # pytype: disable=name-error except NameError: # `file` doesn't exist in Python 3 FILE_TYPES = (io.IOBase,) @functools.total_ordering class Subtitle(object): r""" The metadata relating to a single subtitle. Subtitles are sorted by start time by default. If no index was provided, index 0 will be used on writing an SRT block. :param index: The SRT index for this subtitle :type index: int or None :param start: The time that the subtitle should start being shown :type start: :py:class:`datetime.timedelta` :param end: The time that the subtitle should stop being shown :type end: :py:class:`datetime.timedelta` :param str proprietary: Proprietary metadata for this subtitle :param str content: The subtitle content. Should not contain OS-specific line separators, only \\n. This is taken care of already if you use :py:func:`srt.parse` to generate Subtitle objects. """ # pylint: disable=R0913 def __init__(self, index, start, end, content, proprietary=""): self.index = index self.start = start self.end = end self.content = content self.proprietary = proprietary def __hash__(self): return hash(frozenset(vars(self).items())) def __eq__(self, other): return vars(self) == vars(other) def __lt__(self, other): return (self.start, self.end, self.index) < ( other.start, other.end, other.index, ) def __repr__(self): # Python 2/3 cross compatibility var_items = getattr(vars(self), "iteritems", getattr(vars(self), "items")) item_list = ", ".join("%s=%r" % (k, v) for k, v in var_items()) return "%s(%s)" % (type(self).__name__, item_list) def to_srt(self, strict=True, eol="\n"): r""" Convert the current :py:class:`Subtitle` to an SRT block. :param bool strict: If disabled, will allow blank lines in the content of the SRT block, which is a violation of the SRT standard and may cause your media player to explode :param str eol: The end of line string to use (default "\\n") :returns: The metadata of the current :py:class:`Subtitle` object as an SRT formatted subtitle block :rtype: str """ output_content = self.content output_proprietary = self.proprietary if output_proprietary: # output_proprietary is output directly next to the timestamp, so # we need to add the space as a field delimiter. 
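            # e.g. a (hypothetical) proprietary string "X1:40 X2:600" renders
            # as "00:00:01,000 --> 00:00:02,000 X1:40 X2:600".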
output_proprietary = " " + output_proprietary if strict: output_content = make_legal_content(output_content) if eol is None: eol = "\n" elif eol != "\n": output_content = output_content.replace("\n", eol) template = "{idx}{eol}{start} --> {end}{prop}{eol}{content}{eol}{eol}" return template.format( idx=self.index or 0, start=timedelta_to_srt_timestamp(self.start), end=timedelta_to_srt_timestamp(self.end), prop=output_proprietary, content=output_content, eol=eol, ) def make_legal_content(content): r""" Remove illegal content from a content block. Illegal content includes: * Blank lines * Starting or ending with a blank line .. doctest:: >>> make_legal_content('\nfoo\n\nbar\n') 'foo\nbar' :param str content: The content to make legal :returns: The legalised content :rtype: srt """ # Optimisation: Usually the content we get is legally valid. Do a quick # check to see if we really need to do anything here. This saves time from # generating legal_content by about 50%. if content and content[0] != "\n" and "\n\n" not in content: return content legal_content = MULTI_WS_REGEX.sub("\n", content.strip("\n")) LOG.info("Legalised content %r to %r", content, legal_content) return legal_content def timedelta_to_srt_timestamp(timedelta_timestamp): r""" Convert a :py:class:`~datetime.timedelta` to an SRT timestamp. .. doctest:: >>> import datetime >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4) >>> timedelta_to_srt_timestamp(delta) '01:23:04,000' :param datetime.timedelta timedelta_timestamp: A datetime to convert to an SRT timestamp :returns: The timestamp in SRT format :rtype: str """ hrs, secs_remainder = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR) hrs += timedelta_timestamp.days * HOURS_IN_DAY mins, secs = divmod(secs_remainder, SECONDS_IN_MINUTE) msecs = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND return "%02d:%02d:%02d,%03d" % (hrs, mins, secs, msecs) def srt_timestamp_to_timedelta(timestamp): r""" Convert an SRT timestamp to a :py:class:`~datetime.timedelta`. .. doctest:: >>> srt_timestamp_to_timedelta('01:23:04,000') datetime.timedelta(seconds=4984) :param str timestamp: A timestamp in SRT format :returns: The timestamp as a :py:class:`~datetime.timedelta` :rtype: datetime.timedelta :raises TimestampParseError: If the timestamp is not parseable """ match = TS_REGEX.match(timestamp) if match is None: raise TimestampParseError("Unparseable timestamp: {}".format(timestamp)) hrs, mins, secs, msecs = [int(m) if m else 0 for m in match.groups()] return timedelta(hours=hrs, minutes=mins, seconds=secs, milliseconds=msecs) def sort_and_reindex(subtitles, start_index=1, in_place=False, skip=True): """ Reorder subtitles to be sorted by start time order, and rewrite the indexes to be in that same order. This ensures that the SRT file will play in an expected fashion after, for example, times were changed in some subtitles and they may need to be resorted. If skip=True, subtitles will also be skipped if they are considered not to be useful. Currently, the conditions to be considered "not useful" are as follows: - Content is empty, or only whitespace - The start time is negative - The start time is equal to or later than the end time .. doctest:: >>> from datetime import timedelta >>> one = timedelta(seconds=1) >>> two = timedelta(seconds=2) >>> three = timedelta(seconds=3) >>> subs = [ ... Subtitle(index=999, start=one, end=two, content='1'), ... Subtitle(index=0, start=two, end=three, content='2'), ... 
] >>> list(sort_and_reindex(subs)) # doctest: +ELLIPSIS [Subtitle(...index=1...), Subtitle(...index=2...)] :param subtitles: :py:class:`Subtitle` objects in any order :param int start_index: The index to start from :param bool in_place: Whether to modify subs in-place for performance (version <=1.0.0 behaviour) :param bool skip: Whether to skip subtitles considered not useful (see above for rules) :returns: The sorted subtitles :rtype: :term:`generator` of :py:class:`Subtitle` objects """ skipped_subs = 0 for sub_num, subtitle in enumerate(sorted(subtitles), start=start_index): if not in_place: subtitle = Subtitle(**vars(subtitle)) if skip: try: _should_skip_sub(subtitle) except _ShouldSkipException as thrown_exc: if subtitle.index is None: LOG.info("Skipped subtitle with no index: %s", thrown_exc) else: LOG.info( "Skipped subtitle at index %d: %s", subtitle.index, thrown_exc ) skipped_subs += 1 continue subtitle.index = sub_num - skipped_subs yield subtitle def _should_skip_sub(subtitle): """ Check if a subtitle should be skipped based on the rules in SUBTITLE_SKIP_CONDITIONS. :param subtitle: A :py:class:`Subtitle` to check whether to skip :raises _ShouldSkipException: If the subtitle should be skipped """ for info_msg, sub_skipper in SUBTITLE_SKIP_CONDITIONS: if sub_skipper(subtitle): raise _ShouldSkipException(info_msg) def parse(srt, ignore_errors=False): r''' Convert an SRT formatted string (in Python 2, a :class:`unicode` object) to a :term:`generator` of Subtitle objects. This function works around bugs present in many SRT files, most notably that it is designed to not bork when presented with a blank line as part of a subtitle's content. .. doctest:: >>> subs = parse("""\ ... 422 ... 00:31:39,931 --> 00:31:41,931 ... Using mainly spoons, ... ... 423 ... 00:31:41,933 --> 00:31:43,435 ... we dig a tunnel under the city and release it into the wild. ... ... """) >>> list(subs) # doctest: +ELLIPSIS [Subtitle(...index=422...), Subtitle(...index=423...)] :param srt: Subtitles in SRT format :type srt: str or a file-like object :param ignore_errors: If True, garbled SRT data will be ignored, and we'll continue trying to parse the rest of the file, instead of raising :py:class:`SRTParseError` and stopping execution. :returns: The subtitles contained in the SRT file as :py:class:`Subtitle` objects :rtype: :term:`generator` of :py:class:`Subtitle` objects :raises SRTParseError: If the matches are not contiguous and ``ignore_errors`` is False. ''' expected_start = 0 # Transparently read files -- the whole thing is needed for regex's # finditer if isinstance(srt, FILE_TYPES): srt = srt.read() for match in SRT_REGEX.finditer(srt): actual_start = match.start() _check_contiguity(srt, expected_start, actual_start, ignore_errors) raw_index, raw_start, raw_end, proprietary, content = match.groups() # pytype sees that this is Optional[str] and thus complains that they # can be None, but they can't realistically be None, since we're using # finditer and all match groups are mandatory in the regex. content = content.replace("\r\n", "\n") # pytype: disable=attribute-error try: raw_index = int(raw_index) except ValueError: # Index 123.4. Handled separately, since it's a rare case and we # don't want to affect general performance. # # The pytype disable is for the same reason as content, above. raw_index = int(raw_index.split(".")[0]) # pytype: disable=attribute-error except TypeError: # There's no index, so raw_index is already set to None. We'll # handle this when rendering the subtitle with to_srt. 
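            # (to_srt() later substitutes index 0 when rendering a subtitle
            # whose index is None.)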
pass yield Subtitle( index=raw_index, start=srt_timestamp_to_timedelta(raw_start), end=srt_timestamp_to_timedelta(raw_end), content=content, proprietary=proprietary, ) expected_start = match.end() _check_contiguity(srt, expected_start, len(srt), ignore_errors) def _check_contiguity(srt, expected_start, actual_start, warn_only): """ If ``warn_only`` is False, raise :py:class:`SRTParseError` with diagnostic info if expected_start does not equal actual_start. Otherwise, log a warning. :param str srt: The data being matched :param int expected_start: The expected next start, as from the last iteration's match.end() :param int actual_start: The actual start, as from this iteration's match.start() :raises SRTParseError: If the matches are not contiguous and ``warn_only`` is False """ if expected_start != actual_start: unmatched_content = srt[expected_start:actual_start] if expected_start == 0 and ( unmatched_content.isspace() or unmatched_content == "\ufeff" ): # #50: Leading whitespace has nowhere to be captured like in an # intermediate subtitle return if warn_only: LOG.warning("Skipped unparseable SRT data: %r", unmatched_content) else: raise SRTParseError(expected_start, actual_start, unmatched_content) def compose( subtitles, reindex=True, start_index=1, strict=True, eol=None, in_place=False ): r""" Convert an iterator of :py:class:`Subtitle` objects to a string of joined SRT blocks. .. doctest:: >>> from datetime import timedelta >>> start = timedelta(seconds=1) >>> end = timedelta(seconds=2) >>> subs = [ ... Subtitle(index=1, start=start, end=end, content='x'), ... Subtitle(index=2, start=start, end=end, content='y'), ... ] >>> compose(subs) # doctest: +ELLIPSIS '1\n00:00:01,000 --> 00:00:02,000\nx\n\n2\n00:00:01,000 --> ...' :param subtitles: The subtitles to convert to SRT blocks :type subtitles: :term:`iterator` of :py:class:`Subtitle` objects :param bool reindex: Whether to reindex subtitles based on start time :param int start_index: If reindexing, the index to start reindexing from :param bool strict: Whether to enable strict mode, see :py:func:`Subtitle.to_srt` for more information :param str eol: The end of line string to use (default "\\n") :returns: A single SRT formatted string, with each input :py:class:`Subtitle` represented as an SRT block :param bool in_place: Whether to reindex subs in-place for performance (version <=1.0.0 behaviour) :rtype: str """ if reindex: subtitles = sort_and_reindex( subtitles, start_index=start_index, in_place=in_place ) return "".join(subtitle.to_srt(strict=strict, eol=eol) for subtitle in subtitles) class SRTParseError(Exception): """ Raised when part of an SRT block could not be parsed. :param int expected_start: The expected contiguous start index :param int actual_start: The actual non-contiguous start index :param str unmatched_content: The content between the expected start index and the actual start index """ def __init__(self, expected_start, actual_start, unmatched_content): message = ( "Expected contiguous start of match or end of input at char %d, " "but started at char %d (unmatched content: %r)" % (expected_start, actual_start, unmatched_content) ) super(SRTParseError, self).__init__(message) self.expected_start = expected_start self.actual_start = actual_start self.unmatched_content = unmatched_content class TimestampParseError(ValueError): """ Raised when an SRT timestamp could not be parsed. """ class _ShouldSkipException(Exception): """ Raised when a subtitle should be skipped. 
""" srt-3.5.3/tests/0000775000175000017500000000000014410451013015001 5ustar alexlembckealexlembckesrt-3.5.3/tests/test_srt.py0000664000175000017500000006151614410451013017233 0ustar alexlembckealexlembcke#!/usr/bin/env python # coding=utf8 from __future__ import unicode_literals from datetime import timedelta import collections import functools import os import re import string from io import StringIO import pytest from hypothesis import given, settings, HealthCheck, assume, example import hypothesis.strategies as st import srt REGISTER_SETTINGS = lambda name, **kwargs: settings.register_profile( name, suppress_health_check=[HealthCheck.too_slow], deadline=None, **kwargs ) REGISTER_SETTINGS("base") REGISTER_SETTINGS("release", max_examples=1000) settings.load_profile(os.getenv("HYPOTHESIS_PROFILE", "base")) HOURS_IN_DAY = 24 TIMEDELTA_MAX_DAYS = 999999999 CONTENTLESS_SUB = functools.partial( srt.Subtitle, index=1, start=timedelta(seconds=1), end=timedelta(seconds=2) ) def is_strictly_legal_content(content): """ Filter out things that would violate strict mode. Illegal content includes: - A content section that starts or ends with a newline - A content section that contains blank lines """ if content.strip("\r\n") != content: return False elif not content.strip(): return False elif "\n\n" in content: return False else: return True def subs_eq(got, expected, any_order=False): """ Compare Subtitle objects using vars() so that differences are easy to identify. """ got_vars = [frozenset(vars(sub).items()) for sub in got] expected_vars = [frozenset(vars(sub).items()) for sub in expected] if any_order: assert collections.Counter(got_vars) == collections.Counter(expected_vars) else: assert got_vars == expected_vars def timedeltas(min_value=0, max_value=TIMEDELTA_MAX_DAYS): """ A Hypothesis strategy to generate timedeltas. Right now {min,max}_value are shoved into multiple fields in timedelta(), which is not very customisable, but it's good enough for our current test purposes. If you need more precise control, you may need to add more parameters to this function to be able to customise more freely. 
""" time_unit_strategy = st.integers(min_value=min_value, max_value=max_value) timestamp_strategy = st.builds( timedelta, hours=time_unit_strategy, minutes=time_unit_strategy, seconds=time_unit_strategy, ) return timestamp_strategy def equivalent_timestamps(min_value=0, max_value=TIMEDELTA_MAX_DAYS): def string_timestamp(hours, minutes, seconds, msecs, paddings): hours, minutes, seconds, msecs = map( lambda v_and_p: "0" * v_and_p[1] + str(v_and_p[0]), zip((hours, minutes, seconds, msecs), paddings), ) return "{}:{}:{},{}".format(hours, minutes, seconds, msecs) def ts_field_value(): return st.integers(min_value=min_value, max_value=max_value) def zero_padding(): return st.integers(min_value=0, max_value=2) @st.composite def maybe_off_by_one_fields(draw): field = draw(ts_field_value()) field_maybe_plus_one = draw(st.integers(min_value=field, max_value=field + 1)) return field_maybe_plus_one, field def get_equiv_timestamps(h, m, s, ms2, ts1paddings, ts2paddings): h2, h1 = h m2, m1 = m s2, s1 = s ms1 = ( (h2 - h1) * 60 * 60 * 1000 + (m2 - m1) * 60 * 1000 + (s2 - s1) * 1000 + ms2 ) return ( string_timestamp(h2, m2, s2, ms2, ts2paddings), string_timestamp(h1, m1, s1, ms1, ts1paddings), ) return st.builds( get_equiv_timestamps, maybe_off_by_one_fields(), maybe_off_by_one_fields(), maybe_off_by_one_fields(), ts_field_value(), st.tuples(*[zero_padding() for _ in range(4)]), st.tuples(*[zero_padding() for _ in range(4)]), ) def subtitles(strict=True): """A Hypothesis strategy to generate Subtitle objects.""" # max_value settings are just to avoid overflowing TIMEDELTA_MAX_DAYS by # using arbitrary low enough numbers. # # We also skip subs with start time >= end time, so we split them into two # groups to avoid overlap. start_timestamp_strategy = timedeltas(min_value=0, max_value=500000) end_timestamp_strategy = timedeltas(min_value=500001, max_value=999999) # \r is not legal inside Subtitle.content, it should have already been # normalised to \n. content_strategy = st.text(min_size=1).filter(lambda x: "\r" not in x) proprietary_strategy = st.text().filter( lambda x: all(eol not in x for eol in "\r\n") ) if strict: content_strategy = content_strategy.filter(is_strictly_legal_content) subtitle_strategy = st.builds( srt.Subtitle, index=st.integers(min_value=0), start=start_timestamp_strategy, end=end_timestamp_strategy, proprietary=proprietary_strategy, content=content_strategy, ) return subtitle_strategy @given(st.lists(subtitles())) def test_compose_and_parse_from_file(input_subs): srt_file = StringIO(srt.compose(input_subs, reindex=False)) reparsed_subs = srt.parse(srt_file) subs_eq(reparsed_subs, input_subs) @given(st.lists(subtitles())) def test_compose_and_parse_from_file_bom(input_subs): srt_file = StringIO("\ufeff" + srt.compose(input_subs, reindex=False)) reparsed_subs = srt.parse(srt_file) subs_eq(reparsed_subs, input_subs) @given(st.lists(subtitles())) def test_compose_and_parse_strict(input_subs): composed = srt.compose(input_subs, reindex=False) reparsed_subs = srt.parse(composed) subs_eq(reparsed_subs, input_subs) @given(st.lists(subtitles())) def test_can_compose_without_ending_blank_line(input_subs): """ Many sub editors don't add a blank line to the end, and many editors accept it. We should just accept this too in input. 
""" composed = srt.compose(input_subs, reindex=False) composed_without_ending_blank = composed[:-1] reparsed_subs = srt.parse(composed_without_ending_blank) subs_eq(reparsed_subs, input_subs) @given(st.lists(subtitles())) def test_can_compose_without_eol_at_all(input_subs): composed = srt.compose(input_subs, reindex=False) composed_without_ending_blank = composed.rstrip("\r\n") reparsed_subs = srt.parse(composed_without_ending_blank) subs_eq(reparsed_subs, input_subs) @given(st.text().filter(is_strictly_legal_content)) def test_compose_and_parse_strict_mode(content): # sub.content should not have OS-specific line separators, only \n assume("\r" not in content) content = "\n" + content + "\n\n" + content + "\n" sub = CONTENTLESS_SUB(content=content) parsed_strict = list(srt.parse(sub.to_srt()))[0] parsed_unstrict = list(srt.parse(sub.to_srt(strict=False)))[0] # Strict mode should remove blank lines in content, leading, and trailing # newlines. assert not parsed_strict.content.startswith("\n") assert not parsed_strict.content.endswith("\n") assert "\n\n" not in parsed_strict.content # When strict mode is false, no processing should be applied to the # content (other than \r\n becoming \n). assert parsed_unstrict.content == sub.content.replace("\r\n", "\n") @given(st.integers(min_value=1, max_value=TIMEDELTA_MAX_DAYS)) def test_timedelta_to_srt_timestamp_can_go_over_24_hours(days): srt_timestamp = srt.timedelta_to_srt_timestamp(timedelta(days=days)) srt_timestamp_hours = int(srt_timestamp.split(":")[0]) assert srt_timestamp_hours == days * HOURS_IN_DAY @given(subtitles()) def test_subtitle_equality(sub_1): sub_2 = srt.Subtitle(**vars(sub_1)) assert sub_1 == sub_2 @given(subtitles()) def test_subtitle_inequality(sub_1): sub_2 = srt.Subtitle(**vars(sub_1)) sub_2.index += 1 assert sub_1 != sub_2 @given(subtitles()) def test_subtitle_from_scratch_equality(subtitle): srt_block = subtitle.to_srt() # Get two totally new sets of objects so as not to affect the hash # comparison sub_1 = list(srt.parse(srt_block))[0] sub_2 = list(srt.parse(srt_block))[0] subs_eq([sub_1], [sub_2]) # In case subs_eq and eq disagree for some reason assert sub_1 == sub_2 assert hash(sub_1) == hash(sub_2) @given(st.lists(subtitles())) def test_parsing_spaced_arrow(subs): spaced_block = srt.compose(subs, reindex=False, strict=False).replace("-->", "- >") reparsed_subtitles = srt.parse(spaced_block) subs_eq(reparsed_subtitles, subs) @given(st.lists(subtitles())) def test_parsing_spaced_ender_arrow(subs): # Seen in BSG subtitles spaced_block = srt.compose(subs, reindex=False, strict=False).replace("-->", "-- >") reparsed_subtitles = srt.parse(spaced_block) subs_eq(reparsed_subtitles, subs) @given(st.lists(subtitles())) def test_parsing_no_ws_arrow(subs): spaced_block = srt.compose(subs, reindex=False, strict=False).replace( " --> ", "-->" ) reparsed_subtitles = srt.parse(spaced_block) subs_eq(reparsed_subtitles, subs) @given(st.text(string.whitespace), st.lists(subtitles())) def test_parsing_leading_whitespace(ws, subs): prews_block = ws + srt.compose(subs, reindex=False, strict=False) reparsed_subtitles = srt.parse(prews_block) subs_eq(reparsed_subtitles, subs) @given(st.lists(subtitles())) def test_parsing_negative_index(subs): for sub in subs: sub.index *= -1 prews_block = srt.compose(subs, reindex=False, strict=False) reparsed_subtitles = srt.parse(prews_block) subs_eq(reparsed_subtitles, subs) @given(st.lists(subtitles())) def test_parsing_content_with_blank_lines(subs): for subtitle in subs: # We stuff a blank line in 
the middle so as to trigger the "special" # content parsing for erroneous SRT files that have blank lines. subtitle.content = subtitle.content + "\n\n" + subtitle.content reparsed_subtitles = srt.parse(srt.compose(subs, reindex=False, strict=False)) subs_eq(reparsed_subtitles, subs) @given(st.lists(subtitles())) def test_parsing_no_content(subs): for subtitle in subs: subtitle.content = "" reparsed_subtitles = srt.parse(srt.compose(subs, reindex=False, strict=False)) subs_eq(reparsed_subtitles, subs) @given(st.lists(subtitles()), st.lists(subtitles()), st.text(alphabet="\n\r\t ")) def test_subs_missing_content_removed(content_subs, contentless_subs, contentless_text): for sub in contentless_subs: sub.content = contentless_text subs = contentless_subs + content_subs composed_subs = list(srt.sort_and_reindex(subs, in_place=True)) # We should have composed the same subs as there are in content_subs, as # all contentless_subs should have been stripped. subs_eq(composed_subs, content_subs, any_order=True) # The subtitles should be reindexed starting at start_index, excluding # contentless subs default_start_index = 1 assert [sub.index for sub in composed_subs] == list( range(default_start_index, default_start_index + len(composed_subs)) ) @given( st.lists(subtitles()), st.lists(subtitles()), timedeltas(min_value=-999, max_value=-1), ) def test_subs_starts_before_zero_removed(positive_subs, negative_subs, negative_td): for sub in negative_subs: sub.start = negative_td sub.end = negative_td # Just to avoid tripping any start >= end errors subs = positive_subs + negative_subs composed_subs = list(srt.sort_and_reindex(subs, in_place=True)) # There should be no negative subs subs_eq(composed_subs, positive_subs, any_order=True) @given(st.lists(subtitles(), min_size=1), st.integers(min_value=0)) def test_sort_and_reindex(input_subs, start_index): for sub in input_subs: # Pin all subs to the same end time and index so that only start time is # compared. The pinned end must be greater than any generated sub.start; # see how start_timestamp_strategy is bounded in subtitles() sub.end = timedelta(500001) sub.index = 1 reindexed_subs = list( srt.sort_and_reindex(input_subs, start_index=start_index, in_place=True) ) # The subtitles should be reindexed starting at start_index assert [sub.index for sub in reindexed_subs] == list( range(start_index, start_index + len(input_subs)) ) # The subtitles should be sorted by start time expected_sorting = sorted(input_subs, key=lambda sub: sub.start) assert reindexed_subs == expected_sorting @given(st.lists(subtitles())) def test_sort_and_reindex_no_skip(input_subs): # start time > end time should not trigger a skip if skip=False for sub in input_subs: old_start = sub.start sub.start = sub.end sub.end = old_start reindexed_subs = list(srt.sort_and_reindex(input_subs, skip=False)) # Nothing should have been skipped assert len(reindexed_subs) == len(input_subs) @given(st.lists(subtitles())) def test_sort_and_reindex_handles_no_index(input_subs): # Subs with index=None should be handled gracefully; since start > end # here, they should all be skipped by default for sub in input_subs: old_start = sub.start sub.start = sub.end sub.end = old_start sub.index = None reindexed_subs = list(srt.sort_and_reindex(input_subs)) # Everything should have been skipped assert not reindexed_subs @given(st.lists(subtitles(), min_size=1)) def test_sort_and_reindex_same_start_time_uses_end(input_subs): for sub in input_subs: # Pin all subs to same start time and index so that end time is # compared only sub.start = timedelta(1) sub.index = 1 reindexed_subs =
list(srt.sort_and_reindex(input_subs, in_place=True)) # The subtitles should be sorted by end time when start time is the same expected_sorting = sorted(input_subs, key=lambda sub: sub.end) assert reindexed_subs == expected_sorting @given(st.lists(subtitles(), min_size=1)) def test_sort_and_reindex_same_start_and_end_time_uses_index(input_subs): for sub in input_subs: # Pin all subs to same start and end time so that index is compared # only sub.start = timedelta(1) sub.end = timedelta(2) reindexed_subs = list(srt.sort_and_reindex(input_subs, in_place=True)) # The subtitles should be sorted by index when start and end time are the # same expected_sorting = sorted(input_subs, key=lambda sub: sub.index) assert reindexed_subs == expected_sorting @given(st.lists(subtitles(), min_size=1), st.integers(min_value=0)) def test_sort_and_reindex_not_in_place_matches(input_subs, start_index): # Make copies for both sort_and_reindex calls so that they can't affect # each other not_in_place_subs = [srt.Subtitle(**vars(sub)) for sub in input_subs] in_place_subs = [srt.Subtitle(**vars(sub)) for sub in input_subs] nip_ids = [id(sub) for sub in not_in_place_subs] ip_ids = [id(sub) for sub in in_place_subs] not_in_place_output = list( srt.sort_and_reindex(not_in_place_subs, start_index=start_index) ) in_place_output = list( srt.sort_and_reindex(in_place_subs, start_index=start_index, in_place=True) ) # The results in each case should be the same subs_eq(not_in_place_output, in_place_output) # Not in place sort_and_reindex should have created new subs assert not any(id(sub) in nip_ids for sub in not_in_place_output) # In place sort_and_reindex should be reusing the same subs assert all(id(sub) in ip_ids for sub in in_place_output) @given( st.lists(subtitles(), min_size=1), st.integers(min_value=0), st.text(min_size=1), timedeltas(), ) def test_parser_noncontiguous(subs, fake_idx, garbage, fake_timedelta): composed = srt.compose(subs) # Put some garbage between subs that should trigger our failed parsing # detection. Since we do some magic to try and detect blank lines that # don't really delimit subtitles, it has to look at least a little like an # SRT block. srt_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta) composed = composed.replace( "\n\n", "\n\n%d\n%s %s" % (fake_idx, srt_timestamp, garbage) ) with pytest.raises(srt.SRTParseError): list(srt.parse(composed)) @given( st.lists(subtitles(), min_size=1), st.integers(min_value=0), st.text(min_size=1), timedeltas(), ) def test_parser_noncontiguous_ignore_errors(subs, fake_idx, garbage, fake_timedelta): composed = srt.compose(subs) srt_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta) composed = composed.replace( "\n\n", "\n\n%d\n%s %s" % (fake_idx, srt_timestamp, garbage) ) # Should not raise, we have ignore_errors list(srt.parse(composed, ignore_errors=True)) def _parseable_as_int(text): try: int(text) except ValueError: return False return True def _parseable_as_float(text): try: float(text) except ValueError: return False return True @given(st.lists(subtitles()), st.text(min_size=1)) def test_parser_noncontiguous_leading(subs, garbage): # Issue #50 permits leading whitespace, see test_parsing_leading_whitespace assume(not garbage.isspace()) # Issue #56 permits negative indexes, see test_parsing_negative_index. It # also shouldn't just be a number, because then we'd confuse it with our # index... 
assume(garbage.strip()[0] != ".") assume(garbage.strip()[0] != "-") assume(not _parseable_as_int(garbage.strip())) assume(not _parseable_as_float(garbage.strip())) # Put some garbage at the beginning that should trigger our noncontiguity # checks composed = garbage + srt.compose(subs) with pytest.raises(srt.SRTParseError): list(srt.parse(composed)) @given( st.lists(subtitles(), min_size=1), st.integers(min_value=0), st.text(min_size=1), timedeltas(), ) def test_parser_didnt_match_to_end_raises(subs, fake_idx, garbage, fake_timedelta): srt_blocks = [sub.to_srt() for sub in subs] srt_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta) garbage = "\n\n%d\n%s %s" % (fake_idx, srt_timestamp, garbage) srt_blocks.append(garbage) composed = "".join(srt_blocks) with pytest.raises(srt.SRTParseError) as thrown_exc: list(srt.parse(composed)) # Since we will consume as many \n as needed until we meet the lookahead # assertion, leading newlines in `garbage` will be stripped. garbage_stripped = garbage.lstrip("\n") assert garbage_stripped == thrown_exc.value.unmatched_content assert len(composed) - len(garbage_stripped) == thrown_exc.value.expected_start assert len(composed) == thrown_exc.value.actual_start @given(st.lists(subtitles())) def test_parser_can_parse_with_dot_msec_delimiter(subs): original_srt_blocks = [sub.to_srt() for sub in subs] dot_srt_blocks = [] for srt_block in original_srt_blocks: srt_lines = srt_block.split("\n") # We should only do the first two, as it might also be in the # proprietary metadata, causing this test to fail. dot_timestamp = srt_lines[1].replace(",", ".", 2) srt_lines[1] = dot_timestamp dot_srt_blocks.append("\n".join(srt_lines)) composed_with_dots = "".join(dot_srt_blocks) reparsed_subs = srt.parse(composed_with_dots) subs_eq(reparsed_subs, subs) @given(st.lists(subtitles())) def test_parser_can_parse_with_fullwidth_delimiter(subs): original_srt_blocks = [sub.to_srt() for sub in subs] dot_srt_blocks = [] for srt_block in original_srt_blocks: srt_lines = srt_block.split("\n") dot_timestamp = srt_lines[1].replace(",", "，", 1).replace(":", "：", 1) srt_lines[1] = dot_timestamp dot_srt_blocks.append("\n".join(srt_lines)) composed_with_fullwidth = "".join(dot_srt_blocks) reparsed_subs = srt.parse(composed_with_fullwidth) subs_eq(reparsed_subs, subs) @given(st.lists(subtitles())) def test_parser_can_parse_with_no_msec(subs): original_srt_blocks = [sub.to_srt() for sub in subs] srt_blocks = [] for srt_block in original_srt_blocks: srt_lines = srt_block.split("\n") # We should only do the first two, as it might also be in the # proprietary metadata, causing this test to fail. srt_lines[1] = re.sub(",[0-9]+", "", srt_lines[1], 2) srt_blocks.append("\n".join(srt_lines)) composed = "".join(srt_blocks) reparsed_subs = srt.parse(composed) subs_eq(reparsed_subs, subs) @given(subtitles()) def test_repr_doesnt_crash(sub): # Not much we can do here, but we should make sure __repr__ doesn't crash # or anything and it does at least vaguely look like what we want assert "Subtitle" in repr(sub) assert str(sub.index) in repr(sub) @given(subtitles(), subtitles()) def test_parser_accepts_final_no_newline_no_content(sub1, sub2): # Limit size so we know how much to remove sub2.content = "" subs = [sub1, sub2] # Remove the last newlines so that there are none.
Cannot use rstrip since # there might be other stuff that gets matched in proprietary stripped_srt_blocks = srt.compose(subs, reindex=False)[:-2] reparsed_subs = srt.parse(stripped_srt_blocks) subs_eq(reparsed_subs, subs) @given(st.lists(subtitles())) def test_parser_accepts_newline_no_content(subs): for sub in subs: # Limit size so we know how many lines to remove sub.content = "" # Remove the last \n so that there is only one stripped_srt_blocks = "".join(sub.to_srt()[:-1] for sub in subs) reparsed_subs = srt.parse(stripped_srt_blocks) subs_eq(reparsed_subs, subs) @given(st.lists(subtitles())) def test_compose_and_parse_strict_crlf(input_subs): composed_raw = srt.compose(input_subs, reindex=False) composed = composed_raw.replace("\n", "\r\n") reparsed_subs = list(srt.parse(composed)) for sub in reparsed_subs: sub.content = sub.content.replace("\r\n", "\n") subs_eq(reparsed_subs, input_subs) @given(st.lists(subtitles()), st.one_of(st.just("\n"), st.just("\r\n"))) def test_compose_and_parse_strict_custom_eol(input_subs, eol): composed = srt.compose(input_subs, reindex=False, eol=eol) reparsed_subs = srt.parse(composed) subs_eq(reparsed_subs, input_subs) @given(equivalent_timestamps()) def test_equal_timestamps_despite_different_fields_parsed_as_equal(timestamps): ts1, ts2 = timestamps assert srt.srt_timestamp_to_timedelta(ts1) == srt.srt_timestamp_to_timedelta(ts2) @given(timedeltas()) def test_bad_timestamp_format_raises(ts): ts = srt.timedelta_to_srt_timestamp(ts) ts = ts.replace(":", "t", 1) with pytest.raises(srt.TimestampParseError): srt.srt_timestamp_to_timedelta(ts) @given(st.lists(subtitles()), st.lists(st.sampled_from(string.whitespace))) def test_can_parse_index_trailing_ws(input_subs, whitespace): out = "" for sub in input_subs: lines = sub.to_srt().split("\n") lines[0] = lines[0] + "".join(whitespace) out += "\n".join(lines) reparsed_subs = srt.parse(out) subs_eq(reparsed_subs, input_subs) @given(st.lists(subtitles())) def test_can_parse_index_with_dot(input_subs): # Seen in Battlestar Galactica subs out = "" for sub in input_subs: lines = sub.to_srt().split("\n") lines[0] = lines[0] + "." + lines[0] out += "\n".join(lines) reparsed_subs = srt.parse(out) subs_eq(reparsed_subs, input_subs) @given(st.lists(subtitles()), st.lists(st.just("0"))) def test_can_parse_index_leading_zeroes(input_subs, zeroes): out = "" for sub in input_subs: lines = sub.to_srt().split("\n") lines[0] = "".join(zeroes) + lines[0] out += "\n".join(lines) reparsed_subs = srt.parse(out) subs_eq(reparsed_subs, input_subs) @given(st.lists(subtitles(), min_size=1)) def test_parse_file_with_missing_index(input_subs): # cf. issue #51 for sub in input_subs: try: int(sub.content.strip().split("\n")[-1]) except ValueError: pass else: # If the final line with actual content is a number, we'll parse it # as the index, so ignore that assume(False) out_no_index = "" out_zero_index = "" for sub in input_subs: block = sub.to_srt() block = block[block.index("\n") + 1 :] out_no_index += block input_subs_copy = [srt.Subtitle(**vars(sub)) for sub in input_subs] for sub in input_subs_copy: # sub.index == None will get rendered in to_srt as 0 sub.index = 0 out_zero_index += sub.to_srt() subs_no_index = list(srt.parse(out_no_index)) subs_zero_index = list(srt.parse(out_zero_index)) # One should have index == None, one should have index == 0... 
assert subs_no_index assert subs_zero_index assert all(sub.index == None for sub in subs_no_index) assert all(sub.index == 0 for sub in subs_zero_index) assert subs_no_index != subs_zero_index # ...but they should render the same... assert srt.compose(subs_no_index, reindex=False) == srt.compose( subs_zero_index, reindex=False ) # ...and sort the same. assert srt.compose(subs_no_index) == srt.compose(subs_zero_index) srt-3.5.3/tests/requirements.txt0000664000175000017500000000037714410451013020274 0ustar alexlembckealexlembckepytest==4.*; python_version < '3.0' pytest==6.*; python_version >= '3.0' pytest-xdist==1.*; python_version < '3.0' pytest-xdist==2.*; python_version >= '3.0' pytest-cov==2.* hypothesis==4.*; python_version < '3.6' hypothesis==6.*; python_version >= '3.6' srt-3.5.3/LICENSE0000664000175000017500000000207514410451013014650 0ustar alexlembckealexlembckeThe MIT License Copyright (c) 2014-present Christopher Down Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. srt-3.5.3/docs/0000775000175000017500000000000014410451013014567 5ustar alexlembckealexlembckesrt-3.5.3/docs/quickstart.rst0000664000175000017500000000213014410451013017507 0ustar alexlembckealexlembckeQuickstart ========== Parse an SRT to Python objects ------------------------------ .. code:: python >>> import srt >>> subtitle_generator = srt.parse('''\ ... 1 ... 00:31:37,894 --> 00:31:39,928 ... OK, look, I think I have a plan here. ... ... 2 ... 00:31:39,931 --> 00:31:41,931 ... Using mainly spoons, ... ... 3 ... 00:31:41,933 --> 00:31:43,435 ... we dig a tunnel under the city and release it into the wild. ... ... ''') >>> subtitles = list(subtitle_generator) >>> >>> subtitles[0].start datetime.timedelta(0, 1897, 894000) >>> subtitles[1].content 'Using mainly spoons,' Compose an SRT from Python objects ---------------------------------- .. code:: python >>> print(srt.compose(subtitles)) 1 00:31:37,894 --> 00:31:39,928 OK, look, I think I have a plan here. 2 00:31:39,931 --> 00:31:41,931 Using mainly spoons, 3 00:31:41,933 --> 00:31:43,435 we dig a tunnel under the city and release it into the wild. srt-3.5.3/docs/api.rst0000664000175000017500000000010614410451013016067 0ustar alexlembckealexlembckeAPI documentation ================= .. 
automodule:: srt :members: srt-3.5.3/docs/conf.py0000664000175000017500000000165314410451013016073 0ustar alexlembckealexlembckeimport sys import os # srt.py is in the next directory up sys.path.insert(0, os.path.abspath("..")) extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.doctest"] copyright = "Chris Down" exclude_patterns = ["_build"] master_doc = "index" project = "srt" pygments_style = "sphinx" source_suffix = ".rst" templates_path = ["_templates"] version = "3.5.3" release = version html_static_path = ["_static"] html_theme = "alabaster" htmlhelp_basename = "srtdoc" latex_elements = {} latex_documents = [("index", "srt.tex", "srt Documentation", "Chris Down", "manual")] man_pages = [("index", "srt", "srt Documentation", ["Chris Down"], 1)] texinfo_documents = [ ( "index", "srt", "srt Documentation", "Chris Down", "srt", "One line description of project.", "Miscellaneous", ) ] intersphinx_mapping = {"python": ("https://docs.python.org/3.8", None)} srt-3.5.3/docs/index.rst0000664000175000017500000000050614410451013016431 0ustar alexlembckealexlembcke`srt`: Parse SubRip files ========================= srt_ is a tiny Python library for parsing, modifying, and composing SRT files. .. _srt: https://github.com/cdown/srt Documentation ============= .. toctree:: :maxdepth: 2 quickstart api Indices and tables ================== * :ref:`genindex` * :ref:`search` srt-3.5.3/docs/requirements.txt0000664000175000017500000000001414410451013020046 0ustar alexlembckealexlembckesphinx==3.* srt-3.5.3/README.rst0000664000175000017500000001351614410451013015334 0ustar alexlembckealexlembcke|ghactions| |coveralls| .. |ghactions| image:: https://img.shields.io/github/actions/workflow/status/cdown/srt/ci.yml?branch=develop :target: https://github.com/cdown/srt/actions?query=branch%3Adevelop :alt: Tests .. |coveralls| image:: https://img.shields.io/coveralls/cdown/srt/develop.svg?label=test%20coverage :target: https://coveralls.io/github/cdown/srt?branch=develop :alt: Coverage srt is a tiny but featureful Python library for parsing, modifying, and composing `SRT files`_. Take a look at the quickstart_ for a basic overview of the library. `Detailed API documentation`_ is also available. Want to see some examples of its use? Take a look at the `tools shipped with the library`_. This library is also used internally by projects like `subsync`_, `NVIDIA RAD-TTS`_, `manim`_, `kinobot`_, `bw_plex`_, and many more. .. _subsync: https://github.com/smacke/subsync .. _`NVIDIA RAD-TTS`: https://github.com/NVIDIA/radtts .. _bw_plex: https://github.com/Hellowlol/bw_plex .. _manim: https://github.com/ManimCommunity/manim .. _kinobot: https://github.com/vitiko98/kinobot Why choose this library? ------------------------ - Can parse many broken SRT files which other SRT libraries cannot, and fix them - Extremely lightweight, ~200 lines of code excluding docstrings - Simple, intuitive API - High quality test suite using Hypothesis_ - `100% test coverage`_ (including branches) - `Well documented API`_, at both a high and low level - `~30% faster than pysrt on typical workloads`_ - Full support for `PyPy`_ - No dependencies outside of the standard library - Tolerant of many common errors found in real-world SRT files - Support for Asian-style SRT formats (ie. 
"fullwidth" SRT format) - Completely Unicode compliant - Released under a highly permissive license (MIT) - Real world tested 鈥 used in production to process thousands of SRT files every day - Portable 鈥 runs on Linux, OSX, and Windows - Tools included 鈥 contains lightweight tools to perform generic tasks with the library .. _quickstart: http://srt.readthedocs.org/en/latest/quickstart.html .. _`SRT files`: https://en.wikipedia.org/wiki/SubRip#SubRip_text_file_format .. _Hypothesis: https://github.com/DRMacIver/hypothesis .. _`100% test coverage`: https://coveralls.io/github/cdown/srt?branch=develop .. _`Well documented API`: http://srt.readthedocs.org/en/latest/index.html .. _PyPy: http://pypy.org/ .. _`~30% faster than pysrt on typical workloads`: https://paste.pound-python.org/raw/8nQKbDW0ROWvS7bOeAb3/ Usage ----- Tools ===== There are a number of `tools shipped with the library`_ to manipulate, process, and fix SRT files. Here's an example using `hanzidentifier`_ to strip out non-Chinese lines: .. code:: $ cat pe.srt 1 00:00:33,843 --> 00:00:38,097 Only 3% of the water on our planet is fresh. 鍦扮悆涓婂彧鏈3%鐨勬按鏄贰姘 2 00:00:40,641 --> 00:00:44,687 Yet, these precious waters are rich with surprise. 鍙槸杩欎簺鐝嶈吹鐨勬贰姘翠腑鍗村厖婊′簡鎯婂 $ srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese -i pe.srt 1 00:00:33,843 --> 00:00:38,097 鍦扮悆涓婂彧鏈3%鐨勬按鏄贰姘 2 00:00:40,641 --> 00:00:44,687 鍙槸杩欎簺鐝嶈吹鐨勬贰姘翠腑鍗村厖婊′簡鎯婂 These tools are easy to chain together, for example, say you have one subtitle with Chinese and English, and other with French, but you want Chinese and French only. Oh, and the Chinese one is 5 seconds later than it should be. That's easy enough to sort out: .. code:: $ srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese -i chs+eng.srt | > srt fixed-timeshift --seconds -5 | > srt mux --input - --input fra.srt See the srt_tools/ directory for more information. .. _hanzidentifier: https://github.com/tsroten/hanzidentifier Library ======= `Detailed API documentation`_ is available, but here are the basics. Here's how you convert SRT input to Subtitle objects which you can manipulate: .. code:: python >>> data = '''\ 1 00:00:33,843 --> 00:00:38,097 鍦扮悆涓婂彧鏈3%鐨勬按鏄贰姘 2 00:00:40,641 --> 00:00:44,687 鍙槸杩欎簺鐝嶈吹鐨勬贰姘翠腑鍗村厖婊′簡鎯婂 3 00:00:57,908 --> 00:01:03,414 鎵鏈夐檰鍦扮敓鍛藉綊鏍圭粨搴曢兘渚濊禆鏂兼贰姘 ''' >>> for sub in srt.parse(data): ... print(sub) Subtitle(index=1, start=datetime.timedelta(seconds=33, microseconds=843000), end=datetime.timedelta(seconds=38, microseconds=97000), content='鍦扮悆涓婂彧鏈3%鐨勬按鏄贰姘', proprietary='') Subtitle(index=2, start=datetime.timedelta(seconds=40, microseconds=641000), end=datetime.timedelta(seconds=44, microseconds=687000), content='鍙槸杩欎簺鐝嶈吹鐨勬贰姘翠腑鍗村厖婊′簡鎯婂', proprietary='') Subtitle(index=3, start=datetime.timedelta(seconds=57, microseconds=908000), end=datetime.timedelta(seconds=63, microseconds=414000), content='鎵鏈夐檰鍦扮敓鍛藉綊鏍圭粨搴曢兘渚濊禆鏂兼贰姘', proprietary='') And here's how you go back from Subtitle objects to SRT output: .. code:: python >>> subs = list(srt.parse(data)) >>> subs[1].content = "Changing subtitle data is easy!" >>> print(srt.compose(subs)) 1 00:00:33,843 --> 00:00:38,097 鍦扮悆涓婂彧鏈3%鐨勬按鏄贰姘 2 00:00:40,641 --> 00:00:44,687 Changing subtitle data is easy! 3 00:00:57,908 --> 00:01:03,414 鎵鏈夐檰鍦扮敓鍛藉綊鏍圭粨搴曢兘渚濊禆鏂兼贰姘 Installation ------------ To install the latest stable version from PyPi: .. code:: pip install -U srt To install the latest development version directly from GitHub: .. code:: pip install -U git+https://github.com/cdown/srt.git@develop Testing ------- .. 
code:: tox .. _Tox: https://tox.readthedocs.org .. _`Detailed API documentation`: http://srt.readthedocs.org/en/latest/api.html .. _`tools shipped with the library`: https://github.com/cdown/srt/tree/develop/srt_tools srt-3.5.3/.coveragerc0000664000175000017500000000003414410451013015755 0ustar alexlembckealexlembcke[run] relative_files = True