added public list_transcripts method
This commit is contained in:
parent
8287d1088e
commit
1bc5087575
|
@ -4,17 +4,68 @@ from ._transcripts import TranscriptListFetcher
|
|||
|
||||
|
||||
class YouTubeTranscriptApi():
|
||||
@classmethod
def list_transcripts(cls, video_id, proxies=None):
    """
    Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
    which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
    over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
    metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
    `transcript.translate('en')`. Example::

        # retrieve the available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')

        # iterate over all available transcripts
        for transcript in transcript_list:
            # the Transcript object provides metadata properties
            print(
                transcript.video_id,
                transcript.language,
                transcript.language_code,
                # whether it has been manually created or generated by YouTube
                transcript.is_generated,
                # a list of languages the transcript can be translated to
                transcript.translation_languages,
            )

            # fetch the actual transcript data
            print(transcript.fetch())

            # translating the transcript will return another transcript object
            print(transcript.translate('en').fetch())

        # you can also directly filter for the language you are looking for, using the transcript list
        transcript = transcript_list.find_transcript(['de', 'en'])

        # or just filter for manually created transcripts
        transcript = transcript_list.find_manually_created_transcript(['de', 'en'])

        # or automatically generated ones
        transcript = transcript_list.find_generated_transcript(['de', 'en'])

    :param video_id: the youtube video id
    :type video_id: str
    :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
    :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
    :return: the list of available transcripts
    :rtype: TranscriptList
    """
    with requests.Session() as http_client:
        # A missing or empty `proxies` argument falls back to an empty mapping.
        http_client.proxies = proxies or {}
        return TranscriptListFetcher(http_client).fetch(video_id)
|
||||
|
||||
@classmethod
|
||||
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None):
|
||||
"""
|
||||
Retrieves the transcripts for a list of videos.
|
||||
|
||||
:param video_ids: a list of youtube video ids
|
||||
:type video_ids: [str]
|
||||
:type video_ids: list[str]
|
||||
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
||||
do so.
|
||||
:type languages: [str]
|
||||
:type languages: list[str]
|
||||
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
|
||||
one of the video transcripts
|
||||
:type continue_after_error: bool
|
||||
|
@ -22,7 +73,7 @@ class YouTubeTranscriptApi():
|
|||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
||||
video ids, which could not be retrieved
|
||||
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]})
|
||||
:rtype: ({str: [{'text': str, 'start': float, 'duration': float}]}, [str])
|
||||
"""
|
||||
data = {}
|
||||
unretrievable_videos = []
|
||||
|
@ -41,19 +92,19 @@ class YouTubeTranscriptApi():
|
|||
@classmethod
|
||||
def get_transcript(cls, video_id, languages=('en',), proxies=None):
|
||||
"""
|
||||
Retrieves the transcript for a single video.
|
||||
Retrieves the transcript for a single video. This is just a shortcut for calling::
|
||||
|
||||
YouTubeTranscriptApi.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
|
||||
|
||||
:param video_id: the youtube video id
|
||||
:type video_id: str
|
||||
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
||||
do so.
|
||||
:type languages: [str]
|
||||
:type languages: list[str]
|
||||
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
||||
:rtype: [{'text': str, 'start': float, 'end': float}]
|
||||
:rtype: [{'text': str, 'start': float, 'duration': float}]
|
||||
"""
|
||||
with requests.Session() as http_client:
|
||||
http_client.proxies = proxies if proxies else {}
|
||||
return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
|
||||
return cls.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
|
||||
|
|
|
@ -95,7 +95,7 @@ class TranscriptList():
|
|||
:param captions_json: the JSON parsed from the YouTube pages static HTML
|
||||
:type captions_json: dict
|
||||
:return: the created TranscriptList
|
||||
:rtype TranscriptList
|
||||
:rtype: TranscriptList
|
||||
"""
|
||||
translation_languages = [
|
||||
{
|
||||
|
@ -142,9 +142,9 @@ class TranscriptList():
|
|||
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
||||
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
||||
it fails to do so.
|
||||
:type languages: [str]
|
||||
:type language_codes: list[str]
|
||||
:return: the found Transcript
|
||||
:rtype: Transcript
|
||||
:rtype: Transcript
|
||||
:raises: NoTranscriptFound
|
||||
"""
|
||||
return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
|
||||
|
@ -156,9 +156,9 @@ class TranscriptList():
|
|||
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
||||
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
||||
it fails to do so.
|
||||
:type languages: [str]
|
||||
:type language_codes: list[str]
|
||||
:return: the found Transcript
|
||||
:rtype: Transcript
|
||||
:rtype: Transcript
|
||||
:raises: NoTranscriptFound
|
||||
"""
|
||||
return self._find_transcript(language_codes, [self._generated_transcripts,])
|
||||
|
@ -170,9 +170,9 @@ class TranscriptList():
|
|||
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
||||
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
||||
it fails to do so.
|
||||
:type languages: [str]
|
||||
:type language_codes: list[str]
|
||||
:return: the found Transcript
|
||||
:rtype: Transcript
|
||||
:rtype: Transcript
|
||||
:raises: NoTranscriptFound
|
||||
"""
|
||||
return self._find_transcript(language_codes, [self._manually_created_transcripts,])
|
||||
|
@ -252,7 +252,7 @@ class Transcript():
|
|||
Loads the actual transcript data.
|
||||
|
||||
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
||||
:rtype: [{'text': str, 'start': float, 'end': float}]
|
||||
:rtype: [{'text': str, 'start': float, 'duration': float}]
|
||||
"""
|
||||
return _TranscriptParser().parse(
|
||||
self._http_client.get(self._url).text
|
||||
|
|
Loading…
Reference in New Issue