
    NgP0                     b    d dl Z d dlmZmZmZ d dlmZ d dlmZ d dl	m
Z
  G d de          ZdS )    N)IteratorLiteralOptional)
BaseLoader)Document)get_from_envc                       e Zd ZdZdedefdZdedefdZdddddd	ed
ee         dee         de	d         dee         f
dZ
dee         fdZdS )FireCrawlLoadera!
  
    FireCrawlLoader document loader integration

    Setup:
        Install ``firecrawl-py``,``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``.

        .. code-block:: bash

            pip install -U firecrawl-py langchain_community
            export FIRECRAWL_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import FireCrawlLoader

            loader = FireCrawlLoader(
                url = "https://firecrawl.dev",
                mode = "crawl"
                # other params = ...
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
             Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
             Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}

    paramsreturnc                    d}g d}|D ]}|                     |          rd} n|rt          j        dt                     d|v r|d         du r|d         |d<   |d= d|v r|d         du r|d         |d<   |d= d	|v r|d	         du r|d	         |d
<   |d	= d|v r|d         du r|d         |d<   |d= d|v r<t	          |d         t
                    r|                     |d                   |d<   |d= |S )NF)includesexcludesallowBackwardCrawlingallowExternalContentLinkspageOptionsTBDeprecated parameters detected. See Firecrawl v1 docs for updates.r   includePathsr   excludePathsr   allowBackwardLinksr   allowExternalLinksr   scrapeOptions)getwarningswarnDeprecationWarning
isinstancedictlegacy_scrape_options_adapter)selfr   use_legacy_optionslegacy_keyskeys        j/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/firecrawl.pylegacy_crawler_options_adapterz.FireCrawlLoader.legacy_crawler_options_adapterC   s   "
 
 
  	 	Czz# %)"  	*MT"   V##*%---3J-?F>*:&V##*%---3J-?F>*:&&&0012d::39:Q3RF/023*f4456$>>39:U3VF/067&&f]3T:: .2.P.P}-/ /F?+ =)    c                    d}dg}d|v rd|d         v r|d         d         dk    s$|d         d         dk    s|d         d         dk    rd}d	|d         v r?|d         d	         r|d         d	         |d
<   n|d                              d	d          |d
<   d|d         v r|d         d         r|d         d         |d<   d|d         v r|d         d         r|d         d         |d
<   |d= g d}|D ]}|                     |          rd} n|r\t          j        dt                     d|v r"|d         du r|                    d           |d= d|v r"|d         du r|                    d           |d= d|v r"|d         du r|                    d           |d= d|v r"|d         du r|                    d           |d= d|v r"|d         du r|                    d           |d= d|v r"|d         du r|                    d           |d= d|v r"|d         du r|                    d           |d= d|v r|d         du r|d         |d<   |d= d|v r|d         du r|d         |d <   |d= d!|vr||d!<   |S )"NFmarkdownextractorOptionsmodezllm-extractionzllm-extraction-from-raw-htmlzllm-extraction-from-markdownTextractionPromptpromptz-Extract page information based on the schema.extractionSchemaschema
userPrompt)	includeMarkdownincludeHtmlincludeRawHtmlincludeExtractincludeLinks
screenshotfullPageScreenshotonlyIncludeTags
removeTagsr   r0   r1   htmlr2   rawHtmlr3   extractr4   linksr5   r6   zscreenshot@fullPager7   includeTagsr8   excludeTagsformats)r   r   r   r   removeappend)r    r   r!   r?   scrape_keysr#   s         r$   r   z-FireCrawlLoader.legacy_scrape_options_adapters   s   ",'' 2333-.v6:JJJ01&956 601&956 6 *.&)V4F-GGG!"456HI /56H/I 20F8,, 066H/I/M/M 2 O0 0F8,
 *V4F-GGG!"456HI /56H/I 20F8, $v.@'AAA!"45lC X/56H/I,/WF8,12

 

 

  	 	Czz# %)"  0	)MT"   !F**+,55NN:...,-&&-(D00NN6***=)6))*+t33NN9---+,6))*+t33NN9---+,''.)T11NN7+++>*v%%,'4//NN<000<(#v--./477NN#8999/0 F**+,44,23D,EF=),-v%%,'4//,2<,@F=)<(F"" 'F9r&   Ncrawl)api_keyapi_urlr*   r   urlrD   rE   r*   )rC   scrapemapc                   	 ddl m} n# t          $ r t          d          w xY w|dvrt          d| d          |st          d          |pt	          dd	          } |||
          | _         || _        || _        |pi | _        dS )aR  Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified will be read from env var
                FIRECRAWL_API_KEY. Get an API key
            api_url: The Firecrawl API URL. If not specified will be read from env var
                FIRECRAWL_API_URL or defaults to https://api.firecrawl.dev.
            mode: The mode to run the loader in. Default is "crawl".
                 Options include "scrape" (single url),
                 "crawl" (all accessible sub pages),
                 "map" (returns list of links that are semantically related).
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py
        r   )FirecrawlAppzD`firecrawl` package not found, please run `pip install firecrawl-py`)rC   rG   searchrH   Invalid mode 'z/'. Allowed: 'crawl', 'scrape', 'search', 'map'.zUrl must be providedrD   FIRECRAWL_API_KEY)rD   rE   N)	firecrawlrJ   ImportError
ValueErrorr   rF   r*   r   )r    rF   rD   rE   r*   r   rJ   s          r$   __init__zFireCrawlLoader.__init__   s    4	....... 	 	 	V  	 ;;;VVVV    	53444I\)5HII%gwGGG	ls   	 #c              #   n  K   | j         dk    r;| j                            | j        |                     | j                            g}n| j         dk    rf| j        st          d          | j                            | j        |                     | j                            }|	                    dg           }nz| j         dk    r=| j        st          d          | j        
                    | j        | j                  }n2| j         dk    rt          d	          t          d
| j          d          |D ]~}| j         dk    r|}i }nV|	                    d          p*|	                    d          p|	                    dd          }|	                    di           }|skt          ||          V  d S )NrG   )r   rC   zURL is required for crawl modedatarH   zURL is required for map moderK   z?Search mode is not supported in this version, please downgrade.rL   z%'. Allowed: 'crawl', 'scrape', 'map'.r(   r9   r:    metadata)page_contentrU   )r*   rN   
scrape_urlrF   r   r   rP   	crawl_urlr%   r   map_urlr   )r    firecrawl_docscrawl_responsedocrV   rU   s         r$   	lazy_loadzFireCrawlLoader.lazy_load  s     9  ))HT%G%G%T%T *  NN
 Y'!!8 C !ABBB!^55!D!DT[!Q!Q 6  N ,//;;NNY%8 A !?@@@!^33DHT[3QQNNY(""Q   QQQQ   " 	 	CyE!!" GGJ''T3776??TcggiQS>T>T  77:r22 )!      	 	r&   )__name__
__module____qualname____doc__r   r%   r   strr   r   rQ   r   r   r]    r&   r$   r
   r
   	   s        7 7r.T .d . . . .`jD jT j j j j` "&!%29!%,# ,# ,#,# #	,#
 #,# ./,# ,# ,# ,# ,#\(8H- ( ( ( ( ( (r&   r
   )r   typingr   r   r   langchain_core.document_loadersr   langchain_core.documentsr   langchain_core.utilsr   r
   rc   r&   r$   <module>rh      s     . . . . . . . . . . 6 6 6 6 6 6 - - - - - - - - - - - -l l l l lj l l l l lr&   