diff --git a/.gitignore b/.gitignore index 16a69f2..8b2b8b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ .idea .venv +virtualenv *.pyc dist build *.egg-info -upload_new_version.sh \ No newline at end of file +upload_new_version.sh diff --git a/README.md b/README.md index 75c44cd..9718aed 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,22 @@ This will return a list of dictionaries looking somewhat like this: ] ``` +You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it usually defaults to English). + +```python +YouTubeTranscriptApi.get_transcript(video_id, languages=['de', 'en']) +``` + +It's a list of language codes in a descending priority. In this example it will first try to fetch the German transcript (`'de'`) and then fetch the English transcript (`'en'`) if it fails to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to play around with the language codes a bit, to find the one which is working for you! + To get transcripts for a list fo video ids you can call: ```python -YouTubeTranscriptApi.get_transcripts(video_ids) +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) ``` +`languages` is also optional here. + ### CLI Execute the CLI script using the video ids as parameters and the results will be printed out to the command line: @@ -70,4 +80,4 @@ youtube_transcript_api --json ... > transcrip ## Warning -This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know! \ No newline at end of file +This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. 
So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know! diff --git a/setup.py b/setup.py index d955e1a..f13e8e0 100644 --- a/setup.py +++ b/setup.py @@ -8,13 +8,10 @@ def _get_file_content(file_name): def get_long_description(): return _get_file_content('README.md') -def get_requirements(): - return list(filter(lambda line: line != '' and not line.startswith('#'), _get_file_content('requirements.txt').split('\n'))) - setuptools.setup( name="youtube_transcript_api", - version="0.1.1", + version="0.1.2", author="Jonas Depoix", author_email="jonas.depoix@web.de", description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!", @@ -29,7 +26,9 @@ setuptools.setup( "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ), - install_requires=get_requirements(), + install_requires=[ + 'requests', + ], entry_points={ 'console_scripts': [ 'youtube_transcript_api = youtube_transcript_api.__main__:main', diff --git a/youtube_transcript_api/__main__.py b/youtube_transcript_api/__main__.py index 36b2688..37bd7bb 100644 --- a/youtube_transcript_api/__main__.py +++ b/youtube_transcript_api/__main__.py @@ -11,7 +11,7 @@ from ._api import YouTubeTranscriptApi def main(): logging.basicConfig() - + if len(sys.argv) <= 1: print('No YouTube video id was found') elif sys.argv[1] == '--json': diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 42adeb4..be37b61 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -1,3 +1,9 @@ +import sys + +if sys.version_info.major == 2: + reload(sys) + sys.setdefaultencoding('utf-8') + from xml.etree import ElementTree 
import re @@ -30,14 +36,18 @@ class YouTubeTranscriptApi(): ) self.video_id = video_id - @staticmethod - def get_transcripts(video_ids, continue_after_error=False): + def get_transcripts(video_ids, languages=None, continue_after_error=False): """ Retrieves the transcripts for a list of videos. :param video_ids: a list of youtube video ids :type video_ids: [str] + :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] + it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to + do so. As I can't provide a complete list of all working language codes with full certainty, you may have to + play around with the language codes a bit, to find the one which is working for you! + :type languages: [str] :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving one of the video transcripts :type continue_after_error: bool @@ -50,7 +60,7 @@ class YouTubeTranscriptApi(): for video_id in video_ids: try: - data[video_id] = YouTubeTranscriptApi.get_transcript(video_id) + data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages) except Exception as exception: if not continue_after_error: raise exception @@ -60,17 +70,22 @@ class YouTubeTranscriptApi(): return data, unretrievable_videos @staticmethod - def get_transcript(video_id): + def get_transcript(video_id, languages=None): """ Retrieves the transcript for a single video. :param video_id: the youtube video id :type video_id: str + :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] + it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to + do so. As I can't provide a complete list of all working language codes with full certainty, you may have to + play around with the language codes a bit, to find the one which is working for you! 
+ :type languages: [str] :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype: [{'text': str, 'start': float, 'end': float}] """ try: - return _TranscriptParser(_TranscriptFetcher(video_id).fetch()).parse() + return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse() except Exception: logger.error( YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format( @@ -83,26 +98,36 @@ class YouTubeTranscriptApi(): class _TranscriptFetcher(): WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' API_BASE_URL = 'https://www.youtube.com/api/{api_url}' + LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') - def __init__(self, video_id): + def __init__(self, video_id, languages): self.video_id = video_id + self.languages = languages def fetch(self): fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text - timedtext_url_start = fetched_site.find('timedtext') - return requests.get( - self.API_BASE_URL.format( - api_url=fetched_site[ - timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"') - ].replace( - '\\u0026', '&' - ).replace( - '\\', '' - ) + for language in (self.languages if self.languages else [None,]): + response = self._execute_api_request(fetched_site, timedtext_url_start, language) + if response: + return response + + return None + + def _execute_api_request(self, fetched_site, timedtext_url_start, language): + url = self.API_BASE_URL.format( + api_url=fetched_site[ + timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"') + ].replace( + '\\u0026', '&' + ).replace( + '\\', '' ) - ).text + ) + if language: + url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) + return requests.get(url).text class _TranscriptParser():