fixed bug; added docstrings for public methods

This commit is contained in:
Jonas Depoix 2019-12-11 11:42:14 +01:00
parent df417be915
commit c2c49c3c17
3 changed files with 131 additions and 53 deletions

View File

@ -1,3 +1,3 @@
from ._api import YouTubeTranscriptApi
from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript
from ._transcripts import TranscriptList, Transcript
from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable

View File

@ -1,6 +1,6 @@
import requests
from ._transcripts import TranscriptDataFetcher
from ._transcripts import TranscriptListFetcher
class YouTubeTranscriptApi():
@ -13,8 +13,7 @@ class YouTubeTranscriptApi():
:type video_ids: [str]
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
play around with the language codes a bit, to find the one which is working for you!
do so.
:type languages: [str]
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
one of the video transcripts
@ -23,7 +22,7 @@ class YouTubeTranscriptApi():
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
video ids, which could not be retrieved
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]})
"""
data = {}
unretrievable_videos = []
@ -48,8 +47,7 @@ class YouTubeTranscriptApi():
:type video_id: str
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
play around with the language codes a bit, to find the one which is working for you!
do so.
:type languages: [str]
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
@ -58,4 +56,4 @@ class YouTubeTranscriptApi():
"""
with requests.Session() as http_client:
http_client.proxies = proxies if proxies else {}
return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()

View File

@ -16,12 +16,12 @@ from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled
from ._settings import WATCH_URL
class TranscriptDataFetcher():
class TranscriptListFetcher():
def __init__(self, http_client):
self._http_client = http_client
def fetch(self, video_id):
return TranscriptData.build(
return TranscriptList.build(
self._http_client,
video_id,
self._extract_captions_json(self._fetch_html(video_id), video_id)
@ -48,48 +48,89 @@ class TranscriptDataFetcher():
)
class TranscriptData():
class TranscriptList():
"""
This object represents a list of transcripts. It is meant to be iterated over to list all transcripts which are
available for a given YouTube video (iteration is not implemented yet). It also provides functionality to search for a
transcript in a given language.
"""
# TODO implement iterator
def __init__(
self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages
):
self._http_client = http_client
def __init__(self, video_id, manually_created_transcripts, generated_transcripts):
"""
The constructor is only for internal use. Use the static build method instead.
:param video_id: the id of the video this TranscriptList is for
:type video_id: str
:param manually_created_transcripts: dict mapping language codes to the manually created transcripts
:type manually_created_transcripts: dict[str, Transcript]
:param generated_transcripts: dict mapping language codes to the generated transcripts
:type generated_transcripts: dict[str, Transcript]
"""
self.video_id = video_id
self._manually_created_transcripts = manually_created_transcripts
self._generated_transcripts = generated_transcripts
self._translation_languages = translation_languages
@staticmethod
def build(http_client, video_id, captions_json):
manually_created_transcripts = []
generated_transcripts = []
"""
Factory method for TranscriptList.
for caption in captions_json['captionTracks']:
(generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append(
{
'url': caption['baseUrl'],
'language': caption['name']['simpleText'],
'language_code': caption['languageCode'],
'is_generated': caption.get('kind', '') == 'asr',
'is_translatable': caption['isTranslatable'],
}
)
return TranscriptData(
http_client,
video_id,
manually_created_transcripts,
generated_transcripts,
[
:param http_client: http client which is used to make the transcript retrieving http calls
:type http_client: requests.Session
:param video_id: the id of the video this TranscriptList is for
:type video_id: str
:param captions_json: the JSON parsed from the YouTube page's static HTML
:type captions_json: dict
:return: the created TranscriptList
:rtype: TranscriptList
"""
translation_languages = [
{
'language': translation_language['languageName']['simpleText'],
'language_code': translation_language['languageCode'],
} for translation_language in captions_json['translationLanguages']
],
]
manually_created_transcripts = {}
generated_transcripts = {}
for caption in captions_json['captionTracks']:
if caption.get('kind', '') == 'asr':
transcript_dict = generated_transcripts
else:
transcript_dict = manually_created_transcripts
transcript_dict[caption['languageCode']] = Transcript(
http_client,
video_id,
caption['baseUrl'],
caption['name']['simpleText'],
caption['languageCode'],
caption.get('kind', '') == 'asr',
translation_languages if caption['isTranslatable'] else []
)
return TranscriptList(
video_id,
manually_created_transcripts,
generated_transcripts,
)
def find_transcript(self, language_codes):
"""
Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
are found, generated transcripts are used. If you only want manually created transcripts use
find_manually_created_transcript instead.
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
it fails to do so.
:type language_codes: [str]
:return: the found Transcript
:rtype: Transcript
:raises: NoTranscriptFound
"""
try:
return self.find_manually_created_transcript(language_codes)
except NoTranscriptFound:
@ -98,25 +139,39 @@ class TranscriptData():
return self.find_generated_transcript(language_codes)
def find_generated_transcript(self, language_codes):
"""
Finds an automatically generated transcript for a given language code.
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
it fails to do so.
:type language_codes: [str]
:return: the found Transcript
:rtype: Transcript
:raises: NoTranscriptFound
"""
return self._find_transcript(language_codes, generated=True)
def find_manually_created_transcript(self, language_codes):
"""
Finds a manually created transcript for a given language code.
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
it fails to do so.
:type language_codes: [str]
:return: the found Transcript
:rtype: Transcript
:raises: NoTranscriptFound
"""
return self._find_transcript(language_codes, generated=False)
def _find_transcript(self, language_codes, generated):
transcripts = self._generated_transcripts if generated else self._manually_created_transcripts
for language_code in language_codes:
for transcript in transcripts:
if transcript['language_code'] == language_code:
return Transcript(
self._http_client,
transcript['url'],
transcript['language'],
transcript['language_code'],
transcript['is_generated'],
self._translation_languages if transcript['is_translatable'] else []
)
if language_code in transcripts:
return transcripts[language_code]
raise NoTranscriptFound(
self.video_id,
@ -134,34 +189,59 @@ class TranscriptData():
).format(
video_id=self.video_id,
available_manually_created_transcript_languages=self._get_language_description(
self._manually_created_transcripts
self._manually_created_transcripts.values()
),
available_generated_transcripts=self._get_language_description(
self._generated_transcripts
self._generated_transcripts.values()
),
)
def _get_language_description(self, transcripts):
return '\n'.join(
' - {language_code} ("{language}")'.format(
language=transcript['language'],
language_code=transcript['language_code'],
) for transcript in transcripts
' - {transcript}'.format(transcript=str(transcript))
for transcript in transcripts
) if transcripts else 'None'
class Transcript():
def __init__(self, http_client, url, language, language_code, is_generated, translation_languages):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
"""
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
TranscriptList.
:param http_client: http client which is used to make the transcript retrieving http calls
:type http_client: requests.Session
:param video_id: the id of the video this Transcript is for
:type video_id: str
:param url: the url which needs to be called to fetch the transcript
:param language: the name of the language this transcript uses
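:param language_code: the language code of the language this transcript uses (e.g. 'de')
:param is_generated: whether this transcript was generated automatically rather than created manually
:param translation_languages: the languages this transcript can be translated into (empty if it is not translatable)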
"""
self._http_client = http_client
self.url = url
self.video_id = video_id
self._url = url
self.language = language
self.language_code = language_code
self.is_generated = is_generated
self.translation_languages = translation_languages
def fetch(self):
"""
Loads the actual transcript data.
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype: [{'text': str, 'start': float, 'duration': float}]
"""
return _TranscriptParser().parse(
self._http_client.get(self.url).text
self._http_client.get(self._url).text
)
def __str__(self):
return '{language_code} ("{language}")'.format(
language=self.language,
language_code=self.language_code,
)
# TODO integrate translations in future release