
    Ng!                        d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ 	 eZn# e$ r eefZY nw xY w	 ddlmZ n# e$ r	 ddlmZ Y nw xY w	 ddlmZ n# e$ r	 ddlmZ Y nw xY w G d	 d
e          Z	 ddlmZ  G d de          Z e            Zn# e$ r Y nw xY wd ZddZ	 	 ddZ	 	 ddZddZ ddZ!d Z" e            Z#dS )z?
An interface to html5lib that mimics the lxml.html interface.
    N)
HTMLParser)TreeBuilder)etree)ElementXHTML_NAMESPACE_contains_block_level_tag)urlopen)urlparsec                       e Zd ZdZddZdS )r   z*An html5lib HTML parser with lxml as tree.Fc                 :    t          j        | f|t          d| d S N)stricttree)_HTMLParser__init__r   selfr   kwargss      Q/var/www/html/ai-engine/env/lib/python3.11/site-packages/lxml/html/html5parser.pyr   zHTMLParser.__init__   s(    TM&{MMfMMMMM    NF__name__
__module____qualname____doc__r    r   r   r   r      s4        44N N N N N Nr   r   )XHTMLParserc                       e Zd ZdZddZdS )r   z+An html5lib XHTML Parser with lxml as tree.Fc                 :    t          j        | f|t          d| d S r   )_XHTMLParserr   r   r   s      r   r   zXHTMLParser.__init__*   s(    !$RvKRR6RRRRRr   Nr   r   r   r   r   r   r   '   s4        99	S 	S 	S 	S 	S 	Sr   r   c                 t    |                      |          }||S |                      dt          d|          S )N{})findr   )r   tagelems      r   	_find_tagr(   0   s;    99S>>D999##6777r   c                     t          | t                    st          d          |t          }i }|t          | t                    rd}|||d<    |j        | fi |                                S )z
    Parse a whole document into a string.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    string requiredNT
useChardet)
isinstance_strings	TypeErrorhtml_parserbytesparsegetroot)htmlguess_charsetparseroptionss       r   document_fromstringr7   7   s     dH%% +)***~GD%!8!8   -6<((((00222r   Fc                 t   t          | t                    st          d          |t          }i }|t          | t                    rd}|||d<    |j        | dfi |}|rWt          |d         t                    r<|r:|d                                         rt          j        d|d         z            |d= |S )a`  Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    r*   NFr+   divr   zThere is leading text: %r)	r,   r-   r.   r/   r0   parseFragmentstripr   ParserError)r3   no_leading_textr4   r5   r6   childrens         r   fragments_fromstringr?   O   s     dH%% +)***~GD%!8!8   -#v#D%;;7;;H Jx{H55  	{  "" 5'(C(0)4 5 5 5Or   c                 |   t          | t                    st          d          t          |          }t	          | |||           }|rjt          |t                    sd}t          |          }|r@t          |d         t                    r|d         |_        |d= |                    |           |S |st          j	        d          t          |          dk    rt          j	        d          |d         }|j        r5|j                                        rt          j	        d|j        z            d	|_        |S )
a  Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    r*   )r4   r5   r=   r9   r   zNo elements found   zMultiple elements foundzElement followed by text: %rN)r,   r-   r.   boolr?   r   textextendr   r<   lentailr;   )r3   create_parentr4   r5   accept_leading_textelementsnew_rootresults           r   fragment_fromstringrL   q   sU    dH%% +)***}--#M&//1 1 1H  	-22 	"!M=)) 	&(1+x00   (QKOOH%%% 5 3444
8}}q 9:::a[F{ Nv{((** N > LMMMFKMr   c                    t          | t                    st          d          t          | ||          }| dd         }t          |t                    r|                    dd          }|                                                                }|                    d          s|                    d          r|S t          |d	          }t          |          r|S t          |d
          }t          |          dk    rT|j        r|j                                        s4|d         j        r|d         j                                        s|d         S t          |          rd|_        nd|_        |S )a  Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    'base_url' will set the document's base_url attribute (and the tree's
    docinfo.URL)

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    r*   )r5   r4   N2   asciireplacez<htmlz	<!doctypeheadbodyrA   r   r9   span)r,   r-   r.   r7   r0   decodelstriplower
startswithr(   rE   rC   r;   rF   r   r&   )r3   r4   r5   docstartrQ   rR   s          r   
fromstringr[      sq    dH%% +)***
d6,9; ; ;C "IE% 1 Wi00LLNN  ""E   E$4$4[$A$A 
S&!!D 4yy 
S&!!D 	D		Q	1B1Bb 	&*2hm&9&9&;&; 	Aw
 !&& Kr   c                     |t           }t          | t                    s| }|d}n7t          |           rt	          |           }|d}nt          | d          }|d}i }|r||d<    |j        |fi |S )a*  Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    NFTrbr+   )r/   r,   r-   _looks_like_urlr	   openr1   )filename_url_or_filer4   r5   fpr6   s        r   r1   r1      s     ~*H55 !! !M	-	.	. !)**  M&--  MG  . -6<&&g&&&r   c                     t          |           d         }|sdS t          j        dk    r#|t          j        v rt          |          dk    rdS dS )Nr   Fwin32rA   T)r
   sysplatformstringascii_lettersrE   )strschemes     r   r^   r^      sU    c]]1F u
,'
!
!f***Fq  utr   )NN)FNN)$r   rd   rf   html5libr   r    html5lib.treebuilders.etree_lxmlr   lxmlr   	lxml.htmlr   r   r   
basestringr-   	NameErrorr0   rh   urllib2r	   ImportErrorurllib.requestr
   urllib.parser   r!   xhtml_parserr(   r7   r?   rL   r[   r1   r^   r/   r   r   r   <module>ru      s    


  . . . . . . 8 8 8 8 8 8       I I I I I I I I I IHH   s|HHH' ' ' '&&&&&&&&'&!!!!!!! & & &%%%%%%%%&N N N N N N N N!444444S S S S Sl S S S ;==LL  	 	 	D	8 8 83 3 3 30 0548   D -237) ) ) )X3 3 3 3l!' !' !' !'H
 
 
 jllsA   + 	77A AAA A)(A);B B"!B"