fixed bug; added docstrings for public methods
This commit is contained in:
parent
df417be915
commit
c2c49c3c17
|
@ -1,3 +1,3 @@
|
|||
from ._api import YouTubeTranscriptApi
|
||||
from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript
|
||||
from ._transcripts import TranscriptList, Transcript
|
||||
from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import requests
|
||||
|
||||
from ._transcripts import TranscriptDataFetcher
|
||||
from ._transcripts import TranscriptListFetcher
|
||||
|
||||
|
||||
class YouTubeTranscriptApi():
|
||||
|
@ -13,8 +13,7 @@ class YouTubeTranscriptApi():
|
|||
:type video_ids: [str]
|
||||
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
||||
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
|
||||
play around with the language codes a bit, to find the one which is working for you!
|
||||
do so.
|
||||
:type languages: [str]
|
||||
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
|
||||
one of the video transcripts
|
||||
|
@ -23,7 +22,7 @@ class YouTubeTranscriptApi():
|
|||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
||||
video ids, which could not be retrieved
|
||||
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}
|
||||
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]})
|
||||
"""
|
||||
data = {}
|
||||
unretrievable_videos = []
|
||||
|
@ -48,8 +47,7 @@ class YouTubeTranscriptApi():
|
|||
:type video_id: str
|
||||
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
||||
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
|
||||
play around with the language codes a bit, to find the one which is working for you!
|
||||
do so.
|
||||
:type languages: [str]
|
||||
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||
|
@ -58,4 +56,4 @@ class YouTubeTranscriptApi():
|
|||
"""
|
||||
with requests.Session() as http_client:
|
||||
http_client.proxies = proxies if proxies else {}
|
||||
return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
|
||||
return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
|
||||
|
|
|
@ -16,12 +16,12 @@ from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled
|
|||
from ._settings import WATCH_URL
|
||||
|
||||
|
||||
class TranscriptDataFetcher():
|
||||
class TranscriptListFetcher():
|
||||
def __init__(self, http_client):
|
||||
self._http_client = http_client
|
||||
|
||||
def fetch(self, video_id):
|
||||
return TranscriptData.build(
|
||||
return TranscriptList.build(
|
||||
self._http_client,
|
||||
video_id,
|
||||
self._extract_captions_json(self._fetch_html(video_id), video_id)
|
||||
|
@ -48,48 +48,89 @@ class TranscriptDataFetcher():
|
|||
)
|
||||
|
||||
|
||||
class TranscriptData():
|
||||
class TranscriptList():
|
||||
"""
|
||||
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
||||
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
||||
"""
|
||||
|
||||
# TODO implement iterator
|
||||
|
||||
def __init__(
|
||||
self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages
|
||||
):
|
||||
self._http_client = http_client
|
||||
def __init__(self, video_id, manually_created_transcripts, generated_transcripts):
|
||||
"""
|
||||
The constructor is only for internal use. Use the static build method instead.
|
||||
|
||||
:param video_id: the id of the video this TranscriptList is for
|
||||
:type video_id: str
|
||||
:param manually_created_transcripts: dict mapping language codes to the manually created transcripts
|
||||
:type manually_created_transcripts: dict[str, Transcript]
|
||||
:param generated_transcripts: dict mapping language codes to the generated transcripts
|
||||
:type generated_transcripts: dict[str, Transcript]
|
||||
"""
|
||||
self.video_id = video_id
|
||||
self._manually_created_transcripts = manually_created_transcripts
|
||||
self._generated_transcripts = generated_transcripts
|
||||
self._translation_languages = translation_languages
|
||||
|
||||
@staticmethod
|
||||
def build(http_client, video_id, captions_json):
|
||||
manually_created_transcripts = []
|
||||
generated_transcripts = []
|
||||
"""
|
||||
Factory method for TranscriptList.
|
||||
|
||||
:param http_client: http client which is used to make the transcript retrieving http calls
|
||||
:type http_client: requests.Session
|
||||
:param video_id: the id of the video this TranscriptList is for
|
||||
:type video_id: str
|
||||
:param captions_json: the JSON parsed from the YouTube page's static HTML
|
||||
:type captions_json: dict
|
||||
:return: the created TranscriptList
|
||||
:rtype: TranscriptList
|
||||
"""
|
||||
translation_languages = [
|
||||
{
|
||||
'language': translation_language['languageName']['simpleText'],
|
||||
'language_code': translation_language['languageCode'],
|
||||
} for translation_language in captions_json['translationLanguages']
|
||||
]
|
||||
|
||||
manually_created_transcripts = {}
|
||||
generated_transcripts = {}
|
||||
|
||||
for caption in captions_json['captionTracks']:
|
||||
(generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append(
|
||||
{
|
||||
'url': caption['baseUrl'],
|
||||
'language': caption['name']['simpleText'],
|
||||
'language_code': caption['languageCode'],
|
||||
'is_generated': caption.get('kind', '') == 'asr',
|
||||
'is_translatable': caption['isTranslatable'],
|
||||
}
|
||||
if caption.get('kind', '') == 'asr':
|
||||
transcript_dict = generated_transcripts
|
||||
else:
|
||||
transcript_dict = manually_created_transcripts
|
||||
|
||||
transcript_dict[caption['languageCode']] = Transcript(
|
||||
http_client,
|
||||
video_id,
|
||||
caption['baseUrl'],
|
||||
caption['name']['simpleText'],
|
||||
caption['languageCode'],
|
||||
caption.get('kind', '') == 'asr',
|
||||
translation_languages if caption['isTranslatable'] else []
|
||||
)
|
||||
|
||||
return TranscriptData(
|
||||
http_client,
|
||||
return TranscriptList(
|
||||
video_id,
|
||||
manually_created_transcripts,
|
||||
generated_transcripts,
|
||||
[
|
||||
{
|
||||
'language': translation_language['languageName']['simpleText'],
|
||||
'language_code': translation_language['languageCode'],
|
||||
} for translation_language in captions_json['translationLanguages']
|
||||
],
|
||||
)
|
||||
|
||||
def find_transcript(self, language_codes):
|
||||
"""
|
||||
Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
|
||||
are found, generated transcripts are used. If you only want manually created transcripts use
|
||||
find_manually_created_transcript instead.
|
||||
|
||||
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
||||
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
||||
it fails to do so.
|
||||
:type language_codes: [str]
|
||||
:return: the found Transcript
|
||||
:rtype: Transcript
|
||||
:raises: NoTranscriptFound
|
||||
"""
|
||||
try:
|
||||
return self.find_manually_created_transcript(language_codes)
|
||||
except NoTranscriptFound:
|
||||
|
@ -98,25 +139,39 @@ class TranscriptData():
|
|||
return self.find_generated_transcript(language_codes)
|
||||
|
||||
def find_generated_transcript(self, language_codes):
|
||||
"""
|
||||
Finds an automatically generated transcript for a given language code.
|
||||
|
||||
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
||||
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
||||
it fails to do so.
|
||||
:type language_codes: [str]
|
||||
:return: the found Transcript
|
||||
:rtype: Transcript
|
||||
:raises: NoTranscriptFound
|
||||
"""
|
||||
return self._find_transcript(language_codes, generated=True)
|
||||
|
||||
def find_manually_created_transcript(self, language_codes):
|
||||
"""
|
||||
Finds a manually created transcript for a given language code.
|
||||
|
||||
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
||||
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
||||
it fails to do so.
|
||||
:type language_codes: [str]
|
||||
:return: the found Transcript
|
||||
:rtype: Transcript
|
||||
:raises: NoTranscriptFound
|
||||
"""
|
||||
return self._find_transcript(language_codes, generated=False)
|
||||
|
||||
def _find_transcript(self, language_codes, generated):
|
||||
transcripts = self._generated_transcripts if generated else self._manually_created_transcripts
|
||||
|
||||
for language_code in language_codes:
|
||||
for transcript in transcripts:
|
||||
if transcript['language_code'] == language_code:
|
||||
return Transcript(
|
||||
self._http_client,
|
||||
transcript['url'],
|
||||
transcript['language'],
|
||||
transcript['language_code'],
|
||||
transcript['is_generated'],
|
||||
self._translation_languages if transcript['is_translatable'] else []
|
||||
)
|
||||
if language_code in transcripts:
|
||||
return transcripts[language_code]
|
||||
|
||||
raise NoTranscriptFound(
|
||||
self.video_id,
|
||||
|
@ -134,34 +189,59 @@ class TranscriptData():
|
|||
).format(
|
||||
video_id=self.video_id,
|
||||
available_manually_created_transcript_languages=self._get_language_description(
|
||||
self._manually_created_transcripts
|
||||
self._manually_created_transcripts.values()
|
||||
),
|
||||
available_generated_transcripts=self._get_language_description(
|
||||
self._generated_transcripts
|
||||
self._generated_transcripts.values()
|
||||
),
|
||||
)
|
||||
|
||||
def _get_language_description(self, transcripts):
|
||||
return '\n'.join(
|
||||
' - {language_code} ("{language}")'.format(
|
||||
language=transcript['language'],
|
||||
language_code=transcript['language_code'],
|
||||
) for transcript in transcripts
|
||||
' - {transcript}'.format(transcript=str(transcript))
|
||||
for transcript in transcripts
|
||||
) if transcripts else 'None'
|
||||
|
||||
|
||||
class Transcript():
|
||||
def __init__(self, http_client, url, language, language_code, is_generated, translation_languages):
|
||||
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
|
||||
"""
|
||||
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
||||
TranscriptList.
|
||||
|
||||
:param http_client: http client which is used to make the transcript retrieving http calls
|
||||
:type http_client: requests.Session
|
||||
:param video_id: the id of the video this Transcript is for
|
||||
:type video_id: str
|
||||
:param url: the url which needs to be called to fetch the transcript
|
||||
:param language: the name of the language this transcript uses
|
||||
:param language_code: the language code of this transcript (e.g. 'de' or 'en')
|
||||
:param is_generated: True if this transcript was automatically generated, False if it was manually created
|
||||
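:param translation_languages: list of dicts with the keys 'language' and 'language_code', describing the languages this transcript can be translated to (empty if it is not translatable)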
|
||||
"""
|
||||
self._http_client = http_client
|
||||
self.url = url
|
||||
self.video_id = video_id
|
||||
self._url = url
|
||||
self.language = language
|
||||
self.language_code = language_code
|
||||
self.is_generated = is_generated
|
||||
self.translation_languages = translation_languages
|
||||
|
||||
def fetch(self):
|
||||
"""
|
||||
Loads the actual transcript data.
|
||||
|
||||
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
||||
:rtype: [{'text': str, 'start': float, 'duration': float}]
|
||||
"""
|
||||
return _TranscriptParser().parse(
|
||||
self._http_client.get(self.url).text
|
||||
self._http_client.get(self._url).text
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
return '{language_code} ("{language}")'.format(
|
||||
language=self.language,
|
||||
language_code=self.language_code,
|
||||
)
|
||||
|
||||
# TODO integrate translations in future release
|
||||
|
|
Loading…
Reference in New Issue