fixed bug; added docstrings for public methods

This commit is contained in:
Jonas Depoix 2019-12-11 11:42:14 +01:00
parent df417be915
commit c2c49c3c17
3 changed files with 131 additions and 53 deletions

View File

@ -1,3 +1,3 @@
from ._api import YouTubeTranscriptApi from ._api import YouTubeTranscriptApi
from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript from ._transcripts import TranscriptList, Transcript
from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable

View File

@ -1,6 +1,6 @@
import requests import requests
from ._transcripts import TranscriptDataFetcher from ._transcripts import TranscriptListFetcher
class YouTubeTranscriptApi(): class YouTubeTranscriptApi():
@ -13,8 +13,7 @@ class YouTubeTranscriptApi():
:type video_ids: [str] :type video_ids: [str]
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to do so.
play around with the language codes a bit, to find the one which is working for you!
:type languages: [str] :type languages: [str]
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
one of the video transcripts one of the video transcripts
@ -23,7 +22,7 @@ class YouTubeTranscriptApi():
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
video ids, which could not be retrieved video ids, which could not be retrieved
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]} :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]})
""" """
data = {} data = {}
unretrievable_videos = [] unretrievable_videos = []
@ -48,8 +47,7 @@ class YouTubeTranscriptApi():
:type video_id: str :type video_id: str
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to do so.
play around with the language codes a bit, to find the one which is working for you!
:type languages: [str] :type languages: [str]
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
@ -58,4 +56,4 @@ class YouTubeTranscriptApi():
""" """
with requests.Session() as http_client: with requests.Session() as http_client:
http_client.proxies = proxies if proxies else {} http_client.proxies = proxies if proxies else {}
return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()

View File

@ -16,12 +16,12 @@ from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled
from ._settings import WATCH_URL from ._settings import WATCH_URL
class TranscriptDataFetcher(): class TranscriptListFetcher():
def __init__(self, http_client): def __init__(self, http_client):
self._http_client = http_client self._http_client = http_client
def fetch(self, video_id): def fetch(self, video_id):
return TranscriptData.build( return TranscriptList.build(
self._http_client, self._http_client,
video_id, video_id,
self._extract_captions_json(self._fetch_html(video_id), video_id) self._extract_captions_json(self._fetch_html(video_id), video_id)
@ -48,48 +48,89 @@ class TranscriptDataFetcher():
) )
class TranscriptData(): class TranscriptList():
"""
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
"""
# TODO implement iterator # TODO implement iterator
def __init__( def __init__(self, video_id, manually_created_transcripts, generated_transcripts):
self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages """
): The constructor is only for internal use. Use the static build method instead.
self._http_client = http_client
:param video_id: the id of the video this TranscriptList is for
:type video_id: str
:param manually_created_transcripts: dict mapping language codes to the manually created transcripts
:type manually_created_transcripts: dict[str, Transcript]
:param generated_transcripts: dict mapping language codes to the generated transcripts
:type generated_transcripts: dict[str, Transcript]
"""
self.video_id = video_id self.video_id = video_id
self._manually_created_transcripts = manually_created_transcripts self._manually_created_transcripts = manually_created_transcripts
self._generated_transcripts = generated_transcripts self._generated_transcripts = generated_transcripts
self._translation_languages = translation_languages
@staticmethod @staticmethod
def build(http_client, video_id, captions_json): def build(http_client, video_id, captions_json):
manually_created_transcripts = [] """
generated_transcripts = [] Factory method for TranscriptList.
:param http_client: http client which is used to make the transcript retrieving http calls
:type http_client: requests.Session
:param video_id: the id of the video this TranscriptList is for
:type video_id: str
:param captions_json: the JSON parsed from the YouTube pages static HTML
:type captions_json: dict
:return: the created TranscriptList
:rtype: TranscriptList
"""
translation_languages = [
{
'language': translation_language['languageName']['simpleText'],
'language_code': translation_language['languageCode'],
} for translation_language in captions_json['translationLanguages']
]
manually_created_transcripts = {}
generated_transcripts = {}
for caption in captions_json['captionTracks']: for caption in captions_json['captionTracks']:
(generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append( if caption.get('kind', '') == 'asr':
{ transcript_dict = generated_transcripts
'url': caption['baseUrl'], else:
'language': caption['name']['simpleText'], transcript_dict = manually_created_transcripts
'language_code': caption['languageCode'],
'is_generated': caption.get('kind', '') == 'asr', transcript_dict[caption['languageCode']] = Transcript(
'is_translatable': caption['isTranslatable'], http_client,
} video_id,
caption['baseUrl'],
caption['name']['simpleText'],
caption['languageCode'],
caption.get('kind', '') == 'asr',
translation_languages if caption['isTranslatable'] else []
) )
return TranscriptData( return TranscriptList(
http_client,
video_id, video_id,
manually_created_transcripts, manually_created_transcripts,
generated_transcripts, generated_transcripts,
[
{
'language': translation_language['languageName']['simpleText'],
'language_code': translation_language['languageCode'],
} for translation_language in captions_json['translationLanguages']
],
) )
def find_transcript(self, language_codes): def find_transcript(self, language_codes):
"""
Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
are found, generated transcripts are used. If you only want manually created transcripts use
find_manually_created_transcript instead.
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
it fails to do so.
:type language_codes: [str]
:return: the found Transcript
:rtype: Transcript
:raises: NoTranscriptFound
"""
try: try:
return self.find_manually_created_transcript(language_codes) return self.find_manually_created_transcript(language_codes)
except NoTranscriptFound: except NoTranscriptFound:
@ -98,25 +139,39 @@ class TranscriptData():
return self.find_generated_transcript(language_codes) return self.find_generated_transcript(language_codes)
def find_generated_transcript(self, language_codes): def find_generated_transcript(self, language_codes):
"""
Finds an automatically generated transcript for a given language code.
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
it fails to do so.
:type language_codes: [str]
:return: the found Transcript
:rtype: Transcript
:raises: NoTranscriptFound
"""
return self._find_transcript(language_codes, generated=True) return self._find_transcript(language_codes, generated=True)
def find_manually_created_transcript(self, language_codes): def find_manually_created_transcript(self, language_codes):
"""
Finds a manually created transcript for a given language code.
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
it fails to do so.
:type language_codes: [str]
:return: the found Transcript
:rtype: Transcript
:raises: NoTranscriptFound
"""
return self._find_transcript(language_codes, generated=False) return self._find_transcript(language_codes, generated=False)
def _find_transcript(self, language_codes, generated): def _find_transcript(self, language_codes, generated):
transcripts = self._generated_transcripts if generated else self._manually_created_transcripts transcripts = self._generated_transcripts if generated else self._manually_created_transcripts
for language_code in language_codes: for language_code in language_codes:
for transcript in transcripts: if language_code in transcripts:
if transcript['language_code'] == language_code: return transcripts[language_code]
return Transcript(
self._http_client,
transcript['url'],
transcript['language'],
transcript['language_code'],
transcript['is_generated'],
self._translation_languages if transcript['is_translatable'] else []
)
raise NoTranscriptFound( raise NoTranscriptFound(
self.video_id, self.video_id,
@ -134,34 +189,59 @@ class TranscriptData():
).format( ).format(
video_id=self.video_id, video_id=self.video_id,
available_manually_created_transcript_languages=self._get_language_description( available_manually_created_transcript_languages=self._get_language_description(
self._manually_created_transcripts self._manually_created_transcripts.values()
), ),
available_generated_transcripts=self._get_language_description( available_generated_transcripts=self._get_language_description(
self._generated_transcripts self._generated_transcripts.values()
), ),
) )
def _get_language_description(self, transcripts): def _get_language_description(self, transcripts):
return '\n'.join( return '\n'.join(
' - {language_code} ("{language}")'.format( ' - {transcript}'.format(transcript=str(transcript))
language=transcript['language'], for transcript in transcripts
language_code=transcript['language_code'],
) for transcript in transcripts
) if transcripts else 'None' ) if transcripts else 'None'
class Transcript(): class Transcript():
def __init__(self, http_client, url, language, language_code, is_generated, translation_languages): def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
"""
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
TranscriptList.
:param http_client: http client which is used to make the transcript retrieving http calls
:type http_client: requests.Session
:param video_id: the id of the video this Transcript is for
:type video_id: str
:param url: the url which needs to be called to fetch the transcript
:param language: the name of the language this transcript uses
:param language_code:
:param is_generated:
:param translation_languages:
"""
self._http_client = http_client self._http_client = http_client
self.url = url self.video_id = video_id
self._url = url
self.language = language self.language = language
self.language_code = language_code self.language_code = language_code
self.is_generated = is_generated self.is_generated = is_generated
self.translation_languages = translation_languages self.translation_languages = translation_languages
def fetch(self): def fetch(self):
"""
Loads the actual transcript data.
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype: [{'text': str, 'start': float, 'end': float}]
"""
return _TranscriptParser().parse( return _TranscriptParser().parse(
self._http_client.get(self.url).text self._http_client.get(self._url).text
)
def __str__(self):
return '{language_code} ("{language}")'.format(
language=self.language,
language_code=self.language_code,
) )
# TODO integrate translations in future release # TODO integrate translations in future release