From 69c5a46016b63c30587287e6bfdbc909be8a96ff Mon Sep 17 00:00:00 2001 From: Liam Sy Date: Sun, 2 Oct 2022 20:39:15 -0400 Subject: [PATCH 1/9] added class SRTFormatter --- youtube_transcript_api/formatters.py | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/youtube_transcript_api/formatters.py b/youtube_transcript_api/formatters.py index 986044f..557ec10 100644 --- a/youtube_transcript_api/formatters.py +++ b/youtube_transcript_api/formatters.py @@ -134,6 +134,58 @@ class WebVTTFormatter(Formatter): """ return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts]) +class SRTFormatter(Formatter): + def _seconds_to_timestamp(self, time): + """Helper that converts `time` into a transcript cue timestamp for SRT. + + :param time: a float representing time in seconds. + :type time: float + :return: a string formatted as a cue timestamp, 'HH:MM:SS,MS' + :rtype str + :example: + >>> self._seconds_to_timestamp(6.93) + '00:00:06,930' + """ + time = float(time) + hours, remainder = divmod(time, 3600) + mins, secs = divmod(remainder, 60) + ms = int(round((time - int(time))*1000, 2)) + return "{:02.0f}:{:02.0f}:{:02.0f},{:03d}".format(hours, mins, secs, ms) + + def format_transcript(self, transcript, **kwargs): + """Converts a transcript into SRT formatting. + + :param transcript: + :reference: https://www.3playmedia.com/blog/create-srt-file/ + """ + lines = [] + for i, line in enumerate(transcript): + if i < len(transcript) - 1: + # Looks ahead, use next start time since duration value + # would create an overlap between start times. + time_text = "{} --> {}".format( + self._seconds_to_timestamp(line['start']), + self._seconds_to_timestamp(transcript[i + 1]['start']) + ) + else: + # Reached the end, cannot look ahead, use duration now. + duration = line['start'] + line['duration'] + time_text = "{} --> {}".format( + self._seconds_to_timestamp(line['start']), + self._seconds_to_timestamp(duration) + ) + lines.append("{}\n{}\n{}".format(i + 1, time_text, line['text'])) + + return "\n\n".join(lines) + "\n" + + def format_transcripts(self, transcripts, **kwargs): + """Converts a list of transcripts into SRT formatting. + + :param transcript: + :reference: https://www.3playmedia.com/blog/create-srt-file/ + """ + return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts]) + class FormatterLoader(object): TYPES = { @@ -141,6 +193,7 @@ class FormatterLoader(object): 'pretty': PrettyPrintFormatter, 'text': TextFormatter, 'webvtt': WebVTTFormatter, + 'srt' : SRTFormatter, } class UnknownFormatterType(Exception): From 399b37a22d4b4dd1f830dca92c9526290321f32b Mon Sep 17 00:00:00 2001 From: Liam Sy <79372899+liamrs222@users.noreply.github.com> Date: Sun, 2 Oct 2022 22:54:56 -0400 Subject: [PATCH 2/9] Create codeql-analysis.yml --- .github/workflows/codeql-analysis.yml | 74 +++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 .github/workflows/codeql-analysis.yml diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 0000000..780d0c7 --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,74 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: [ "master" ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ "master" ] + schedule: + - cron: '17 1 * * 2' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # â„šī¸ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. + + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" From 0dea8dfbaf3cf4c72c89489e90516c02e22abd9a Mon Sep 17 00:00:00 2001 From: Liam Sy <79372899+liamrs222@users.noreply.github.com> Date: Sun, 2 Oct 2022 22:55:22 -0400 Subject: [PATCH 3/9] Create SECURITY.md --- SECURITY.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..034e848 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,21 @@ +# Security Policy + +## Supported Versions + +Use this section to tell people about which versions of your project are +currently being supported with security updates. + +| Version | Supported | +| ------- | ------------------ | +| 5.1.x | :white_check_mark: | +| 5.0.x | :x: | +| 4.0.x | :white_check_mark: | +| < 4.0 | :x: | + +## Reporting a Vulnerability + +Use this section to tell people how to report a vulnerability. + +Tell them where to go, how often they can expect to get an update on a +reported vulnerability, what to expect if the vulnerability is accepted or +declined, etc. From 6cbb5c6ba7c242801291ae3ba0c025ba3c463615 Mon Sep 17 00:00:00 2001 From: Liam Sy <79372899+liamrs222@users.noreply.github.com> Date: Fri, 14 Oct 2022 18:45:39 -0400 Subject: [PATCH 4/9] Delete SECURITY.md --- SECURITY.md | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md deleted file mode 100644 index 034e848..0000000 --- a/SECURITY.md +++ /dev/null @@ -1,21 +0,0 @@ -# Security Policy - -## Supported Versions - -Use this section to tell people about which versions of your project are -currently being supported with security updates. - -| Version | Supported | -| ------- | ------------------ | -| 5.1.x | :white_check_mark: | -| 5.0.x | :x: | -| 4.0.x | :white_check_mark: | -| < 4.0 | :x: | - -## Reporting a Vulnerability - -Use this section to tell people how to report a vulnerability. - -Tell them where to go, how often they can expect to get an update on a -reported vulnerability, what to expect if the vulnerability is accepted or -declined, etc. From 037058171eb727f26b4ec91a05da3d2ba42631c2 Mon Sep 17 00:00:00 2001 From: Liam Sy <79372899+liamrs222@users.noreply.github.com> Date: Fri, 14 Oct 2022 18:45:57 -0400 Subject: [PATCH 5/9] Delete codeql-analysis.yml --- .github/workflows/codeql-analysis.yml | 74 --------------------------- 1 file changed, 74 deletions(-) delete mode 100644 .github/workflows/codeql-analysis.yml diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml deleted file mode 100644 index 780d0c7..0000000 --- a/.github/workflows/codeql-analysis.yml +++ /dev/null @@ -1,74 +0,0 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# -name: "CodeQL" - -on: - push: - branches: [ "master" ] - pull_request: - # The branches below must be a subset of the branches above - branches: [ "master" ] - schedule: - - cron: '17 1 * * 2' - -jobs: - analyze: - name: Analyze - runs-on: ubuntu-latest - permissions: - actions: read - contents: read - security-events: write - - strategy: - fail-fast: false - matrix: - language: [ 'python' ] - # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] - # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v2 - with: - languages: ${{ matrix.language }} - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. - - # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs - # queries: security-extended,security-and-quality - - - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). - # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v2 - - # â„šī¸ Command-line programs to run using the OS shell. - # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - - # If the Autobuild fails above, remove it and uncomment the following three lines. - # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. - - # - run: | - # echo "Run, Build Application using script" - # ./location_of_script_within_repo/buildscript.sh - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 - with: - category: "/language:${{matrix.language}}" From 68ca703ae0b7aeef4e7c38166cb5165261f99505 Mon Sep 17 00:00:00 2001 From: Liam Sy Date: Thu, 20 Oct 2022 17:25:31 -0400 Subject: [PATCH 6/9] updated WebVTT and SRT formatters --- youtube_transcript_api/formatters.py | 122 ++++++++++----------------- 1 file changed, 46 insertions(+), 76 deletions(-) diff --git a/youtube_transcript_api/formatters.py b/youtube_transcript_api/formatters.py index 557ec10..387e565 100644 --- a/youtube_transcript_api/formatters.py +++ b/youtube_transcript_api/formatters.py @@ -79,8 +79,19 @@ class TextFormatter(Formatter): """ return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts]) +class _TextBasedFormatter(TextFormatter): + def _format_timestamp(self, hours, mins, secs, ms): + raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \ + 'their own .format_timestamp() method.') -class WebVTTFormatter(Formatter): + def _format_transcript_header(self, lines): + raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \ + 'their own _format_transcript_header method.') + + def _format_transcript_helper(self, i, time_text, line): + raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \ + 'their own _format_transcript_helper method.') + def _seconds_to_timestamp(self, time): """Helper that converts `time` into a transcript cue timestamp. @@ -95,96 +106,55 @@ class WebVTTFormatter(Formatter): '00:00:06.930' """ time = float(time) - hours, remainder = divmod(time, 3600) - mins, secs = divmod(remainder, 60) + hours_float, remainder = divmod(time, 3600) + mins_float, secs_float = divmod(remainder, 60) + hours, mins, secs = int(hours_float), int(mins_float), int(secs_float) ms = int(round((time - int(time))*1000, 2)) - return "{:02.0f}:{:02.0f}:{:02.0f}.{:03d}".format(hours, mins, secs, ms) + return self._format_timestamp(hours, mins, secs, ms) def format_transcript(self, transcript, **kwargs): - """A basic implementation of WEBVTT formatting. + """A basic implementation of WEBVTT/SRT formatting. :param transcript: - :reference: https://www.w3.org/TR/webvtt1/#introduction-caption + :reference: + https://www.w3.org/TR/webvtt1/#introduction-caption + https://www.3playmedia.com/blog/create-srt-file/ """ lines = [] for i, line in enumerate(transcript): - if i < len(transcript) - 1: - # Looks ahead, use next start time since duration value - # would create an overlap between start times. - time_text = "{} --> {}".format( - self._seconds_to_timestamp(line['start']), - self._seconds_to_timestamp(transcript[i + 1]['start']) + end = line['start'] + line['duration'] + time_text = "{} --> {}".format( + self._seconds_to_timestamp(line['start']), + self._seconds_to_timestamp( + transcript[i + 1]['start'] + if i < len(transcript) - 1 and transcript[i + 1]['start'] < end else end ) - else: - # Reached the end, cannot look ahead, use duration now. - duration = line['start'] + line['duration'] - time_text = "{} --> {}".format( - self._seconds_to_timestamp(line['start']), - self._seconds_to_timestamp(duration) - ) - lines.append("{}\n{}".format(time_text, line['text'])) + ) + lines.append(self._format_transcript_helper(i, time_text, line)) - return "WEBVTT\n\n" + "\n\n".join(lines) + "\n" + return self._format_transcript_header(lines) - def format_transcripts(self, transcripts, **kwargs): - """A basic implementation of WEBVTT formatting for a list of transcripts. - - :param transcripts: - :reference: https://www.w3.org/TR/webvtt1/#introduction-caption - """ - return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts]) - -class SRTFormatter(Formatter): - def _seconds_to_timestamp(self, time): - """Helper that converts `time` into a transcript cue timestamp for SRT. - - :param time: a float representing time in seconds. - :type time: float - :return: a string formatted as a cue timestamp, 'HH:MM:SS,MS' - :rtype str - :example: - >>> self._seconds_to_timestamp(6.93) - '00:00:06,930' - """ - time = float(time) - hours, remainder = divmod(time, 3600) - mins, secs = divmod(remainder, 60) - ms = int(round((time - int(time))*1000, 2)) - return "{:02.0f}:{:02.0f}:{:02.0f},{:03d}".format(hours, mins, secs, ms) - - def format_transcript(self, transcript, **kwargs): - """Converts a transcript into SRT formatting. - - :param transcript: - :reference: https://www.3playmedia.com/blog/create-srt-file/ - """ - lines = [] - for i, line in enumerate(transcript): - if i < len(transcript) - 1: - # Looks ahead, use next start time since duration value - # would create an overlap between start times. - time_text = "{} --> {}".format( - self._seconds_to_timestamp(line['start']), - self._seconds_to_timestamp(transcript[i + 1]['start']) - ) - else: - # Reached the end, cannot look ahead, use duration now. - duration = line['start'] + line['duration'] - time_text = "{} --> {}".format( - self._seconds_to_timestamp(line['start']), - self._seconds_to_timestamp(duration) - ) - lines.append("{}\n{}\n{}".format(i + 1, time_text, line['text'])) +class SRTFormatter(_TextBasedFormatter): + def _format_timestamp(self, hours, mins, secs, ms): + return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, mins, secs, ms) + + def _format_transcript_header(self, lines): return "\n\n".join(lines) + "\n" - def format_transcripts(self, transcripts, **kwargs): - """Converts a list of transcripts into SRT formatting. + def _format_transcript_helper(self, i, time_text, line): + return "{}\n{}\n{}".format(i + 1, time_text, line['text']) - :param transcript: - :reference: https://www.3playmedia.com/blog/create-srt-file/ - """ - return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts]) + +class WebVTTFormatter(_TextBasedFormatter): + def _format_timestamp(self, hours, mins, secs, ms): + return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms) + + def _format_transcript_header(self, lines): + return "WEBVTT\n\n" + "\n\n".join(lines) + "\n" + + def _format_transcript_helper(self, i, time_text, line): + return "{}\n{}".format(time_text, line['text']) class FormatterLoader(object): From a78f49338665b8c258e92c731d1119a654f072f1 Mon Sep 17 00:00:00 2001 From: Liam Sy Date: Thu, 20 Oct 2022 17:56:49 -0400 Subject: [PATCH 7/9] added test cases for SRTFormmater --- .../test/test_formatters.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/youtube_transcript_api/test/test_formatters.py b/youtube_transcript_api/test/test_formatters.py index 748ed02..ce7d869 100644 --- a/youtube_transcript_api/test/test_formatters.py +++ b/youtube_transcript_api/test/test_formatters.py @@ -8,6 +8,7 @@ from youtube_transcript_api.formatters import ( Formatter, JSONFormatter, TextFormatter, + SRTFormatter, WebVTTFormatter, PrettyPrintFormatter, FormatterLoader ) @@ -28,6 +29,25 @@ class TestFormatters(TestCase): with self.assertRaises(NotImplementedError): Formatter().format_transcripts([self.transcript]) + def test_srt_formatter(self): + content = SRTFormatter().format_transcript(self.transcript) + lines = content.split('\n') + + # test starting lines + self.assertEqual(lines[0], "1") + self.assertEqual(lines[1], "00:00:00,000 --> 00:00:01,500") + + # test end lines + self.assertEqual(lines[-2], self.transcript[-1]['text']) + self.assertEqual(lines[-1], "") + + def test_srt_formatter_many(self): + formatter = SRTFormatter() + content = formatter.format_transcripts(self.transcripts) + formatted_single_transcript = formatter.format_transcript(self.transcript) + + self.assertEqual(content, formatted_single_transcript + '\n\n\n' + formatted_single_transcript) + def test_webvtt_formatter_starting(self): content = WebVTTFormatter().format_transcript(self.transcript) lines = content.split('\n') From 88bab9e77caa151ddaa8bd5ff16d11b420c0d9b2 Mon Sep 17 00:00:00 2001 From: Liam Sy Date: Tue, 25 Oct 2022 10:58:44 -0400 Subject: [PATCH 8/9] added more tests for SRTFormatter --- youtube_transcript_api/test/test_formatters.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_transcript_api/test/test_formatters.py b/youtube_transcript_api/test/test_formatters.py index ce7d869..321d74d 100644 --- a/youtube_transcript_api/test/test_formatters.py +++ b/youtube_transcript_api/test/test_formatters.py @@ -29,13 +29,26 @@ class TestFormatters(TestCase): with self.assertRaises(NotImplementedError): Formatter().format_transcripts([self.transcript]) - def test_srt_formatter(self): + def test_srt_formatter_starting(self): content = SRTFormatter().format_transcript(self.transcript) lines = content.split('\n') # test starting lines self.assertEqual(lines[0], "1") self.assertEqual(lines[1], "00:00:00,000 --> 00:00:01,500") + + def test_srt_formatter_middle(self): + content = SRTFormatter().format_transcript(self.transcript) + lines = content.split('\n') + + #test middle lines + self.assertEqual(lines[4], "2") + self.assertEqual(lines[5], "00:00:01,500 --> 00:00:02,500") + self.assertEqual(lines[6], self.transcript[1]['text']) + + def test_srt_formatter_ending(self): + content = SRTFormatter().format_transcript(self.transcript) + lines = content.split('\n') # test end lines self.assertEqual(lines[-2], self.transcript[-1]['text']) From 77f1405ca2d35bbd8ba8e38a77c13cbf97c0be3d Mon Sep 17 00:00:00 2001 From: Liam Sy Date: Tue, 25 Oct 2022 11:08:52 -0400 Subject: [PATCH 9/9] revised test comments --- youtube_transcript_api/test/test_formatters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_transcript_api/test/test_formatters.py b/youtube_transcript_api/test/test_formatters.py index 321d74d..b0b3ba2 100644 --- a/youtube_transcript_api/test/test_formatters.py +++ b/youtube_transcript_api/test/test_formatters.py @@ -41,7 +41,7 @@ class TestFormatters(TestCase): content = SRTFormatter().format_transcript(self.transcript) lines = content.split('\n') - #test middle lines + # test middle lines self.assertEqual(lines[4], "2") self.assertEqual(lines[5], "00:00:01,500 --> 00:00:02,500") self.assertEqual(lines[6], self.transcript[1]['text']) @@ -50,7 +50,7 @@ class TestFormatters(TestCase): content = SRTFormatter().format_transcript(self.transcript) lines = content.split('\n') - # test end lines + # test ending lines self.assertEqual(lines[-2], self.transcript[-1]['text']) self.assertEqual(lines[-1], "")