
    Ng5H                    r   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ  ej        e          ZdgZ e G d d                      Z!ddhZ"h dZ#ddZ$ G d de          Z% G d de          Z&e G d de                      Z'dS )zLoads YouTube transcript.    )annotationsN)Enum)Path)AnyDict	GeneratorListOptionalSequenceUnion)parse_qsurlparse)
ParseError)Document)model_validator)	dataclass)
BaseLoaderz0https://www.googleapis.com/auth/youtube.readonlyc                      e Zd ZU dZ ej                    dz  dz  Zded<    ej                    dz  dz  Zded<    ej                    dz  dz  Z	ded<   ddZ
 ed          edd                        ZddZdS )GoogleApiClienta  Generic Google API Client.

    To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
    python package installed.
    As the google api expects credentials you need to set up a google account and
    register your Service. "https://developers.google.com/docs/api/quickstart/python"

    *Security Note*: Note that parsing of the transcripts relies on the standard
        xml library but the input is viewed as trusted in this case.


    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )

    z.credentialszcredentials.jsonr   credentials_pathservice_account_pathz
token.json
token_pathreturnNonec                8    |                                  | _        d S N)_load_credentialscredsselfs    h/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/youtube.py__post_init__zGoogleApiClient.__post_init__2   s    ++--


    beforemodevaluesDict[str, Any]r   c                x    |                     d          s$|                     d          st          d          |S )DValidate that either folder_id or document_ids is set, but not both.r   r   -Must specify either channel_name or video_idsget
ValueErrorclsr'   s     r!   #validate_channel_or_videoIds_is_setz3GoogleApiClient.validate_channel_or_videoIds_is_set5   sI    
 zz,-- 	Nfjj"7
 7
 	N LMMMr#   c                6   	 ddl m} ddlm} ddlm} ddlm} ddlm	} n# t          $ r t          d          w xY wd}| j                                        r,|j                            t          | j                            S | j                                        r-|                    t          | j                  t"                    }|r|j        s|r,|j        r%|j        r|                     |                       nC|                    t          | j                  t"                    }|                    d	          }t3          | j        d
          5 }|                    |                                           ddd           n# 1 swxY w Y   |S )zLoad credentials.r   )Request)service_account)Credentials)InstalledAppFlowYouTubeTranscriptApiYou must run`pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib youtube-transcript-api` to use the Google Drive loaderN)portw)google.auth.transport.requestsr3   google.oauth2r4   google.oauth2.credentialsr5   google_auth_oauthlib.flowr6   youtube_transcript_apir8   ImportErrorr   existsfrom_service_account_filestrr   from_authorized_user_fileSCOPESvalidexpiredrefresh_tokenrefreshfrom_client_secrets_filer   run_local_serveropenwriteto_json)	r    r3   r4   r5   r6   r8   r   flowtokens	            r!   r   z!GoogleApiClient._load_credentials@   s   	>>>>>>555555======BBBBBBCCCCCCC 	 	 	1  	 $++-- 	".HHD-..   ?!!## 	X99#do:N:NPVWWE 		-EK 		- 6 65+> 6ggii(((('@@-..  --1-55dos++ -uEMMOO,,,- - - - - - - - - - - - - - - s   ! ;(FFFNr   r   r'   r(   r   r   )r   r   )__name__
__module____qualname____doc__r   homer   __annotations__r   r   r"   r   classmethodr1   r    r#   r!   r   r      s          * 'TY[[>9<NNNNNN!*~!=@R!RRRRR ty{{^3lBJBBBB. . . . _(###   [ $#& & & & & &r#   r   httphttps>   vid.plusyoutu.beyoutube.comm.youtube.comwww.youtube.comwww.youtube-nocookie.comurlrD   r   Optional[str]c                   t          |           }|j        t          vrdS |j        t          vrdS |j        }|                    d          rD|j        }t          |          }d|v r(|d         }t          |t                    r|n|d         }n7dS |j                            d          }|                    d          d         }t          |          dk    rdS |S )zEParse a YouTube URL and return the video ID if valid, otherwise None.Nz/watchvr   /   )r   schemeALLOWED_SCHEMESnetlocALLOWED_NETLOCSpathendswithqueryr   
isinstancerD   lstripsplitlen)rd   
parsed_urlro   rq   parsed_queryidsvideo_ids          r!   _parse_video_idrz   t   s    #J//t//t?D}}X 
' ,s#C(c22>ssAHH4%%c**::c??2&
8}}tOr#   c                      e Zd ZdZdZdZdZdS )TranscriptFormatz3Output formats of transcripts from `YoutubeLoader`.textlineschunksN)rT   rU   rV   rW   TEXTLINESCHUNKSr[   r#   r!   r|   r|      s#        ==DEFFFr#   r|   c                      e Zd ZdZdddej        ddfd&dZed'd            Ze	d(d            Z
d)dZd*d!Zd+d#Zd,d%ZdS )-YoutubeLoaderz!Load `YouTube` video transcripts.FenNx   ry   rD   add_video_infoboollanguageUnion[str, Sequence[str]]translationre   transcript_formatr|   continue_on_failurechunk_size_secondsintc                    || _         d|i| _        || _        || _        t	          |t
                    r	|g| _        n|| _        || _        || _        || _        || _	        dS )z!Initialize with YouTube video ID.sourceN)
ry   	_metadatar   r   rr   rD   r   r   r   r   )r    ry   r   r   r   r   r   r   s           r!   __init__zYoutubeLoader.__init__   sp     !"H-, h$$ 	%%JDMM$DM&!2#6 "4r#   youtube_urlr   c                N    t          |           }|st          d|  d          |S )z*Extract video ID from common YouTube URLs.z.Could not determine the video ID for the URL "z".)rz   r.   )r   ry   s     r!   extract_video_idzYoutubeLoader.extract_video_id   s@     #;// 	PPPP   r#   kwargsr   c                >    |                      |          } | |fi |S )z|Given a YouTube URL, construct a loader.
        See `YoutubeLoader()` constructor for a list of keyword arguments.
        )r   )r0   r   r   ry   s       r!   from_youtube_urlzYoutubeLoader.from_youtube_url   s/    
 ''44s8&&v&&&r#   chunk_pieces
List[Dict]chunk_start_secondsr   c                    t          |d          \  }}t          |d          \  }}t          d                    t          d |                    i | j        ||dd|dd|dd| j         d| dd	
          S )z0Create Document from chunk of transcript pieces.<    c                8    | d                              d          S Nr}   r   strip)chunk_pieces    r!   <lambda>z4YoutubeLoader._make_chunk_document.<locals>.<lambda>   s    F(;(A(A#(F(F r#   02d: https://www.youtube.com/watch?v=z&t=s)start_secondsstart_timestampr   page_contentmetadata)divmodr   joinmapr   ry   )r    r   r   mr   hs         r!   _make_chunk_documentz"YoutubeLoader._make_chunk_document   s     )2..1a}}1FFUU .!4&'#=#=#=a#=#=#=a#=#=#=-4= - -)- - -  	
 
 
 	
r#   transcript_piecesGenerator[Document, None, None]c              #  ,  K   g }d}| j         }|D ]V}|d         |d         z   }||k    r(|r|                     ||          V  g }|}|| j         z  }|                    |           Wt          |          dk    r|                     ||          V  d S d S )Nr   startduration)r   r   appendru   )r    r   r   r   chunk_time_limittranscript_piece	piece_ends          r!   _get_transcript_chunksz$YoutubeLoader._get_transcript_chunks   s       .02 1 		2 		2(14DZ4PPI+++ W33LBUVVVVV!&6# D$;;  01111|q  ++L:MNNNNNNN ! r#   List[Document]c                   	 ddl m}m}m} n# t          $ r t	          d          w xY w| j        r.|                                 }| j                            |           	 |	                    | j
                  }n# |$ r g cY S w xY w	 |                    | j                  }n!# |$ r |                    dg          }Y nw xY w| j        |                    | j                  }|                                }| j        t"          j        k    r;d                    t)          d |                    }t+          || j                  gS | j        t"          j        k    rt/          t)          d	 |                    S | j        t"          j        k    r"t/          |                     |                    S t5          d
          )z1Load YouTube transcripts into `Document` objects.r   )NoTranscriptFoundTranscriptsDisabledr8   zvCould not import "youtube_transcript_api" Python package. Please install it with `pip install youtube-transcript-api`.r   Nr   c                8    | d                              d          S r   r   r   s    r!   r   z$YoutubeLoader.load.<locals>.<lambda>  s    -=f-E-K-KC-P-P r#   r   c           
         t          | d                             d          t          t          d |                                                               S )Nr}   r   c                    | d         dk    S )Nr   r}   r[   )items    r!   r   z6YoutubeLoader.load.<locals>.<lambda>.<locals>.<lambda>"  s    T!W-> r#   r   )r   r   dictfilteritemsr   s    r!   r   z$YoutubeLoader.load.<locals>.<lambda>  sZ    X%5f%=%C%CC%H%H!%" > >@P@V@V@X@X " ". . . r#   zUnknown transcript format.)r@   r   r   r8   rA   r   _get_video_infor   updatelist_transcriptsry   find_transcriptr   r   	translatefetchr   r|   r   r   r   r   r   listr   r   r.   )r    r   r   r8   
video_infotranscript_list
transcriptr   s           r!   loadzYoutubeLoader.load   sH   
	          
  	 	 	O  	  	. --//JN!!*---	2CCDMRROO" 	 	 	III		A(88GGJJ  	A 	A 	A(88$@@JJJ	A '#--d.>??J2<2B2B2D2D!%5%:::PP%  J *t~NNNOO#'7'===  &
 
   #'7'>>>334EFFGGG 9:::s,   
 ' A; ;BB	B$ $CCr   c                    	 ddl m} n# t          $ r t          d          w xY w |d| j                   }|j        pd|j        pd|j        pd|j        pd|j        r|j        	                    d          nd|j
        pd|j        pdd}|S )zGet important video information.

        Components include:
            - title
            - description
            - thumbnail URL,
            - publish_date
            - channel author
            - and more.
        r   )YouTubezVCould not import "pytube" Python package. Please install it with `pip install pytube`.r   Unknownz%Y-%m-%d %H:%M:%S)titledescription
view_countthumbnail_urlpublish_datelengthauthor)pytuber   rA   ry   r   r   viewsr   r   strftimer   r   )r    r   ytr   s       r!   r   zYoutubeLoader._get_video_info/  s    	&&&&&&& 	 	 	?  	
 WGGGHHX*>6Y(-a-:BO445HIIIin1i,9

 


 s   	 #)ry   rD   r   r   r   r   r   re   r   r|   r   r   r   r   )r   rD   r   rD   )r   rD   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   )rT   rU   rV   rW   r|   r   r   staticmethodr   rZ   r   r   r   r   r   r[   r#   r!   r   r      s        ++
  %.2%).>.C$)"%5 5 5 5 50    \ ' ' ' ['
 
 
 
*O O O O(=; =; =; =;~     r#   r   c                      e Zd ZU dZded<   dZded<   dZded<   d	Zd
ed<   dZded<   dZ	d
ed<   d(dZ
d)dZ ed          ed*d                        Zd+dZd,d Zd-d!Zd.d#Zd/d&Zd0d'ZdS )1GoogleApiYoutubeLoadera  Load all Videos from a `YouTube` Channel.

    To use, you should have the ``googleapiclient,youtube_transcript_api``
    python package installed.
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

    Additionally you have to either provide a channel name or a list of videoids
    "https://developers.google.com/docs/api/quickstart/python"



    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            from langchain_community.document_loaders import GoogleApiYoutubeLoader
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
            loader = GoogleApiYoutubeLoader(
                google_api_client=google_api_client,
                channel_name = "CodeAesthetic"
            )
            load.load()

    r   google_api_clientNre   channel_namezOptional[List[str]]	video_idsTr   r   r   rD   captions_languageFr   r   r   c                N    |                      | j        j                  | _        d S r   )_build_youtube_clientr   r   youtube_clientr   s    r!   r"   z$GoogleApiYoutubeLoader.__post_init__v  s$    "889O9UVVr#   r   r   c                t    	 ddl m} ddlm} n# t          $ r t	          d          w xY w |dd|          S )Nr   )buildr7   r9   youtubev3)credentials)googleapiclient.discoveryr   r@   r8   rA   )r    r   r   r8   s       r!   r   z,GoogleApiYoutubeLoader._build_youtube_clienty  sy    	777777CCCCCCC 	 	 	1  	 uY%8888s    )r$   r%   r'   r(   c                x    |                     d          s$|                     d          st          d          |S )r*   r   r   r+   r,   r/   s     r!   r1   z:GoogleApiYoutubeLoader.validate_channel_or_videoIds_is_set  sA     zz.)) 	N&**[2I2I 	NLMMMr#   ry   c                0   ddl m}m} |                    |          }	 |                    | j        g          }n*# |$ r" |D ]}|                    | j                  }Y nw xY w|                                }d                    d |D                       S )Nr   )r   r8   r   c                D    g | ]}|d                               d          S )r}   r   r   ).0ts     r!   
<listcomp>zGGoogleApiYoutubeLoader._get_transcripe_for_video_id.<locals>.<listcomp>  s(    III!6--IIIr#   )	r@   r   r8   r   r   r   r   r   r   )r    ry   r   r8   r   r   available_transcriptr   s           r!   _get_transcripe_for_video_idz3GoogleApiYoutubeLoader._get_transcripe_for_video_id  s    RRRRRRRR.??II	(88$:P9QRRJJ  	 	 	(7  $1;;D<RSS
 	
 ',,..xxII7HIIIJJJs   ; $A"!A"r   r   c                    |                      |          }| j                                                            d|                                          }t          ||                    d          d                   S )N
id,snippetpartidr   r   r   )r   r   videosr   executer   r-   )r    ry   r   captionsvideo_responses        r!   _get_document_for_video_idz1GoogleApiYoutubeLoader._get_document_for_video_id  s    44X>>&&((T!    WYY 	 !#''003
 
 
 	
r#   c                    | j                                                             d|dd          }|                                }|d         d         d         d         }|S )Nr  channel   )r   qtype
maxResultsr   r   	channelId)r   searchr   r  )r    r   requestresponse
channel_ids        r!   _get_channel_idz&GoogleApiYoutubeLoader._get_channel_id  sg    %,,..33	 4 
 
 ??$$g&q)$/<
r#   r  c                    | j                                                             d|          }|                                }|d         d         d         d         d         S )NcontentDetailsr   r   r   relatedPlaylistsuploads)r   channelsr   r  )r    r  r  r  s       r!   _get_uploads_playlist_idz/GoogleApiYoutubeLoader._get_uploads_playlist_id  sc    %..0055! 6 
 
 ??$$ #$456HI)TTr#   r  r   c           	     r   	 ddl m}m} n# t          $ r t          d          w xY w|                     |          }|                     |          }| j                                                            d|d          }g }|1|	                                }	|	d         D ]}
|
d         d	         d
         }d
|i}| j
        r6|
d                             d           |                    |
d                    	 |                     |          }|                    t          ||                     # ||t           f$ rC}| j        r0t$                              dd|
d         d
          d| z              n|Y d }~d }~ww xY w| j                                                            ||	          }|1|S )Nr   )r   r   zTYou must run`pip install --upgrade youtube-transcript-api` to use the youtube loaderr   2   )r   
playlistIdr  r   snippet
resourceIdvideoId
thumbnailsr   zError fetching transscript r   r  z, exception: )r@   r   r   rA   r  r  r   playlistItemsr   r  r   popr   r   r   r   r   r   loggererrorr  	list_next)r    r  r   r   r   r  uploads_playlist_idr  r   r  r   ry   	meta_datar   es                  r!   _get_document_for_channelz0GoogleApiYoutubeLoader._get_document_for_channel  sV   	          	 	 	,  	 ))'22
";;JGG%3355::* ; 
 

 	!((H !)  	?<8C&1	& 6O''555$$T)_555#'#D#DX#N#NL$$ )5%.      ,->
K   /  9I$t*Y"7IIaIIJ   
  DDDD )0022<<WhOOG7 !: s     %:9D44F9E??Fc                     g } j         r.|                                          j                              n= j        r'|                     fd j        D                        nt	          d          |S )zLoad documents.c                :    g | ]}                     |          S r[   )r  )r   ry   r    s     r!   r   z/GoogleApiYoutubeLoader.load.<locals>.<listcomp>  s7         33H==  r#   r+   )r   extendr(  r   r.   )r    document_lists   ` r!   r   zGoogleApiYoutubeLoader.load  s     
	N  !?!?@Q!R!RSSSS^ 	N     $(N      LMMMr#   rR   )r   r   r   r   rS   )ry   rD   r   rD   )ry   rD   r   r   r   r   )r   rD   r   rD   )r  rD   r   rD   )r  rD   r   r   r   r   r   )rT   rU   rV   rW   rY   r   r   r   r   r   r"   r   r   rZ   r1   r   r  r  r  r(  r   r[   r#   r!   r   r   Q  sj         8 '&&&"&L&&&&%)I))))N!!!!! %%%%%W W W W9 9 9 9  _(###   [ $#K K K K
 
 
 
	 	 	 	U U U U3 3 3 3j     r#   r   )rd   rD   r   re   )(rW   
__future__r   loggingenumr   pathlibr   typingr   r   r   r	   r
   r   r   urllib.parser   r   xml.etree.ElementTreer   langchain_core.documentsr   pydanticr   pydantic.dataclassesr   )langchain_community.document_loaders.baser   	getLoggerrT   r"  rF   r   rl   rn   rz   r|   r   r   r[   r#   r!   <module>r9     s!     " " " " " "              H H H H H H H H H H H H H H H H H H + + + + + + + + , , , , , , - - - - - - $ $ $ $ $ $ * * * * * * @ @ @ @ @ @		8	$	$
<	= N N N N N N N Nb 7#     <    t   t t t t tJ t t tn r r r r rZ r r r r rr#   