
    Ng                       U d Z ddlmZ ddlmZmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ee*e	f         Z+de,d<   	 ddZ-ddZ. G d de          Z/ee/         Z0de,d<   	  G d d          Z1 G d d          Z2 G d de2          Z3 G d d ej4                  Z5 G d! d"e5          Z6 G d# d$e5          Z7 G d% d&e5          Z8 G d' d(e5          Z9 G d) d*e6          Z: G d+ d,e5          Z; G d- d.e5          Z< G d/ d0ej4                  Z= G d1 d2e=          Z> G d3 d4e=          Z? G d5 d6e=          Z@ G d7 d8e=          ZA G d9 d:e=          ZB G d; d<e5e=          ZCdd>ZD ejE        d?@          ZF ejG        eCA          ZH ejI        eH          ZJeFK                    eJ           eJL                    dB          M                    i dCe5dDe5dEe5dFe5dGe5dHe5dIe5dJe5dKe5dLe5dMe5dNe5dOe7dPe7dQe7dRe7dSe7i dTe7dUe6dVe:dWe8dXe8dYe9dZe;d[e>d\e?d]e@d^e@d_e?d`e=dae=dbe=dce=ddeAi dee=dfe=dge=dhe=die=dje=dke=dle=dme=dne=doe=dpe=dqe=dre=dse=dte=due=i dve=dwe=dxe=dyeBdzeBd{e<d|e<d}e<d~e<de<de<de<de<de<de<de<           dBS )a  Provides the HTML parser used by `partition_html()`.

The names "flow" and "phrasing" derive from the language of the HTML Standard.

PRINCIPLES

- _Elements are paragraphs._ Each paragraph in the HTML document should become a distinct element.
  In particular, a paragraph should not be split into two elements and an element should not
  contain more than one paragraph.

- _An empty paragraph is not an Element._ A paragraph which contains no text or contains only
  whitespace does not give rise to an Element (is skipped).

- _The browser rendering is the document._ The HTML "source-code" is not the document. The document
  is the way that HTML is rendered by a browser (Chrome for a first authority). This foundational
  principle gives rise to a few that are more specific.

- _Whitespace is normalized._ Whitespace used for formatting the HTML source is _normalized_ to a
  single space between text segments. More specifically:
  - Any leading or trailing space on a paragraph is removed.
  - All other runs of whitespace in the paragraph are reduced to a single space (" ").
  - Whitespace is never added where none existed in the HTML source.
  - Whitespace within a `<pre>` element is the exception and is not normalized. Its
    whitespace is preserved excepting a leading and/or trailing newline ("
").

- _Block-items are paragraphs._ Visible content in HTML can be divided into _block-items_ and
  _phrasing content_ (aka. _inline content_).
  - As an example, a `<p>` element is a block item and a `<b>` element is phrasing.
  - A block item starts a new paragraph and so represents an Element boundary.
  - A phrasing item affects the appearance of a run of text within a paragraph, like making it
    bold or making it into a link.
  - Some elements can take either role, depending upon their ancestors and descendants.
  - The final authority for whether a particular element is displayed as a block or as inline
    "formatting" is the CSS. We do not attempt to interpret the CSS and assume the default role
    for each element.

Other background

- The parser's design is _recursive_, consistent with the recursive (tree) structure of HTML. The
  nodes of the tree are _HTML elements_. Unfortunately this naming sometimes conflicts with
  Unstructured _document-elements_. In the parser code the term "document-element" is used when
  there may be ambiguity.

- The parser is primarily composed of `lxml` Custom Element Classes. The gist is you write a class
  like `Anchor` and then tell the `lxml` parser that all `<a>` elements should be instantiated
  using the `Anchor` class. We also provide a default class for any elements that we haven't
  called out explicitly.

- _Anatomy of an HTML element._ Some basic terms are important to know to understand the domain
  language of the parser code. Consider this example:
  ```html
  <div>
    <p>Text <b>bold child</b> tail of child</p>
    tail of p
  </div>
  ```
  - An element can have _text_.
    - All visible content within an HTML document is the text (or tail) of some element.
    - The text of the `<p>` element (`p.text`) is "Text ".
    - Note the formatting whitespace is included.
  - An element can have _child elements_.
    - The `<p>` element (`p`) is a child of `div`.
    - `b` is a child of `p`.
  - An element can have a _tail_.
    - Whatever text follows an element, before the next element starts, is the tail of
      that element.
    - `b.tail` is `" tail of child"`. Note the included whitespace.
    - `p.tail` is `"
    tail of p
"`.
    - Tail text is _accessed_ via the element that precedes it but that element does not
      _influence_ its tail text. For example, "tail of child" does not appear in a bold
      typeface even though it is the tail of `b`.
    )annotations)defaultdictdeque)MappingProxyType)AnyIterableIteratorMapping
NamedTupleSequencecast)etree)	TypeAlias)clean_bullets)htmlify_matrix_of_cell_texts)	AddressElementElementMetadataEmailAddressListItemNarrativeTextTableTextTitle)is_bulleted_textis_email_addressis_possible_narrative_textis_possible_titleis_us_city_state_zip)lazypropertyr   
Annotationr   Iterable[Annotation]returnc                   t          t          t          t          t                   f         t          t                              }| D ]}|                                D ]o\  }}t          |t                    r:||                             t          t          t                   |                     T||                             |           pt          t          |                    S )zpCombine individual text-segment annotations into an element-level annotation.

    Sequence is significant.
    )r   r   strlistitems
isinstanceextendr   appendr   dict)r   combined_annotationsakvs        ^/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/html/parser.py_consolidate_annotationsr1   x   s    
  CcN ;[=N=NOO 2 2GGII 	2 	2DAq!T"" 2$Q'..tDIq/A/ABBBB$Q'..q1111		2 D!566777    textr%   c                t    d                     |                                                                           S )a  `text` with normalized whitespace.

    - leading and trailing whitespace are removed
    - all whitespace segments within text (spacing between words) are reduced to a single space
      each.

    Produces the empty string when `text` contains only whitespace.
     )joinstripsplitr3   s    r0   _normalize_textr:      s*     88DJJLL&&(()))r2   c                  (    e Zd ZU dZded<   ded<   dS )TextSegmenta  An annotated string from a Phrasing element.

    Annotations are for emphasis and for links. The text includes any leading, trailing, and
    inter-word whitespace, just as it occurred in the HTML. The text-segments for a paragraph are
    consolidated once the paragraph is fully parsed and whitespace it normalized at that time. It
    cannot be normalized prior to that without distoring or losing inter-word spacing.

    However, text within annotations, like the text of a link, is normalized since its full extents
    are known.
    r%   r3   r!   
annotationN)__name__
__module____qualname____doc____annotations__ r2   r0   r<   r<      s3         	 	 IIIr2   r<   Phrasec                  (    e Zd ZdZd ZddZdd	Zd
S )_PhraseAccumulatora  Accumulates sequential `TextSegment`s making them available as iterable on flush().

    - The accumulator starts empty.
    - `.flush()` is a Phrase iterator and generates zero or one Phrase.
    - `.flush()` generates zero items when no text-segments have been accumulated
    - `flush()` resets the accumulator to its initial empty state.

    So far, phrases are used only by the Anchor class.
    c                    g | _         d S N)_text_segmentsselfs    r0   __init__z_PhraseAccumulator.__init__   s    13r2   text_segmentr<   r#   Nonec                :    | j                             |           dS )z&Add `text_segment` to this collection.NrI   r*   rK   rM   s     r0   addz_PhraseAccumulator.add       ""<00000r2   Iterator[Phrase]c              #     K   | j         dd         }| j                                          |sdS t          |          V  dS )zMGenerate each of the stored `TextSegment` objects and clears the accumulator.N)rI   cleartuple)rK   text_segmentss     r0   flushz_PhraseAccumulator.flush   sU       +AAA.!!### 	FM"""""""r2   NrM   r<   r#   rN   )r#   rT   )r>   r?   r@   rA   rL   rR   rY   rC   r2   r0   rF   rF      sU         4 4 41 1 1 1	# 	# 	# 	# 	# 	#r2   rF   c                  J    e Zd ZdZddZdd	ZddZddZedd            Z	dS )_ElementAccumulatora"  Accumulates sequential `TextSegment`s and forms them into an element on flush().

    The text segments come from element text or tails and any contiguous phrasing elements that
    follow that text or tail.

    - The accumulator starts empty.
    - `.flush()` is an element iterator and generates zero or one Element.
    - `.flush()` generates zero elements when no text-segments have been accumulated or the ones
      that have been accumulated contain only whitespace.
    - `flush()` resets the accumulator to its initial empty state.
    elementetree.ElementBasec                "    || _         g | _        d S rH   )_elementrI   )rK   r]   s     r0   rL   z_ElementAccumulator.__init__   s    13r2   rM   r<   r#   rN   c                :    | j                             |           dS )z6Add `text_segment` to this Element-under-construction.NrP   rQ   s     r0   rR   z_ElementAccumulator.add   rS   r2   
ElementClstype[Element] | NoneIterator[Element]c              #  h  K   | j         }| j        dd         }| j                                         |r|sdS |/t          |          }|dS |t          u rt          |          }|sdS |                     |          } ||t          di t          d |D                       d|i          V  dS )zIGenerate zero-or-one document-`Element` object and clear the accumulator.Nc              3  $   K   | ]}|j         V  d S rH   r=   .0tss     r0   	<genexpr>z,_ElementAccumulator.flush.<locals>.<genexpr>  s$      *Q*QR2=*Q*Q*Q*Q*Q*Qr2   category_depthmetadatarC   )	_normalized_textrI   rV   derive_element_type_from_textr   r   _category_depthr   r1   )rK   rb   normalized_textrX   rl   s        r0   rY   z_ElementAccumulator.flush   s      / +AAA.!!### 	O 	F 6GGJ !X%%"/"@"@& F--j99j$  **Q*Q=*Q*Q*QQQ -  
 
 
 	
 	
 	
 	
 	
r2   type[Element]
int | Nonec                   |t           u r@| j        j        dv r0t          d | j                                        D                       ndS |t
          u r2| j        j        dv r"t          | j        j        d                   dz
  ndS dS )z8Not clear on concept. Something to do with hierarchy ...)liddc                $    g | ]}|j         d v |S ))dlolul)tag)ri   es     r0   
<listcomp>z7_ElementAccumulator._category_depth.<locals>.<listcomp>  s%    ]]]1J\A\A\QA\A\A\r2   r   )h1h2h3h4h5h6   N)r   r`   r|   leniterancestorsr   int)rK   rb   s     r0   rq   z#_ElementAccumulator._category_depth  s    !! =$44 ]] ; ; = =]]]^^^  =$(LLL DM%a())A-- tr2   r%   c                    d                     d                     d | j        D                                                                 S )aV  Consolidate text-segment text values into a single whitespace-normalized string.

        This normalization is suitable for text inside a block element including any segments from
        phrasing elements immediately following that text. The spec is:

        - All text segments are concatenated (without adding or removing whitespace)
        - Leading and trailing whitespace are removed.
        - Each run of whitespace in the string is reduced to a single space.

        For example:
          "  
   foo  bar
baz bada 	 bing
  "
        becomes:
          "foo bar baz bada bing"
        r5    c              3  $   K   | ]}|j         V  d S rH   r9   rh   s     r0   rk   z7_ElementAccumulator._normalized_text.<locals>.<genexpr>4  s$      FFBFFFFFFr2   )r6   rI   r8   rJ   s    r0   ro   z$_ElementAccumulator._normalized_text$  s@      xxFF$2EFFFFFLLNNOOOr2   N)r]   r^   rZ   )rb   rc   r#   rd   )rb   rs   r#   rt   r#   r%   )
r>   r?   r@   rA   rL   rR   rY   rq   propertyro   rC   r2   r0   r\   r\      s        
 
4 4 4 41 1 1 1!
 !
 !
 !
F   $ P P P XP P Pr2   r\   c                  *    e Zd ZdZedd            ZdS )_PreElementAccumulatorzXAccumulator specific to `<pre>` element, preserves (most) whitespace in normalized text.r#   r%   c                    d                     d | j        D                       }|                    d          rdnd}|                    d          rdnt	          |          }|||         S )zConsolidate `texts` into a single whitespace-normalized string.

        This normalization is specific to the `<pre>` element. Only a leading and or trailing
        newline is removed. All other whitespace is preserved.
        r   c              3  $   K   | ]}|j         V  d S rH   r9   rh   s     r0   rk   z:_PreElementAccumulator._normalized_text.<locals>.<genexpr>A  s$      ==2rw======r2   
r   r   )r6   rI   
startswithendswithr   )rK   r3   startends       r0   ro   z'_PreElementAccumulator._normalized_text:  so     ww==)<=====__T**1MM$''6bbSYYE#Ir2   Nr   )r>   r?   r@   rA   r   ro   rC   r2   r0   r   r   7  s8        bb   X  r2   r   c                  b    e Zd ZdZdZedd            ZddZedd	            Z		 dddZ
ddZdS )Flowz~Base and default class for elements that act like a div.

    These can contain other flow elements or phrasing elements.
    Nr#   boolc                    dS )NFrC   rJ   s    r0   is_phrasingzFlow.is_phrasingZ  s    ur2   rd   c              #  l  K   t          |           }|                     | j        pd|| j                  E d{V  |rw|d         j        rJ t          t          |                                          }|                                E d{V  |                     |j	        pd|          E d{V  |udS dS )5Generate paragraph string for each block item within.r   Nr   )
r   _element_from_text_or_tailr3   _ElementClsr   r   r   popleftiter_elementstail)rK   q
block_items      r0   r   zFlow.iter_elements^  s       %*$KK2249?AtGWXXXXXXXXX 	Qt''''dAIIKK00J!//11111111166z7L"aPPPPPPPPP	  	Q 	Q 	Q 	Q 	Qr2   r\   c                     t          |           S z9Text-segment accumulator suitable for this block-element.)r\   rJ   s    r0   _element_accumzFlow._element_accumk  s     #4(((r2   r3   r%   r   deque[Flow | Phrasing]rb   rc   c              #    K   | j         }|                     ||          D ]L}t          |t                    r|                    |           -|                    |          E d{V  |V  M|                    |          E d{V  dS )zGenerate zero-or-one paragraph formed from text and leading phrasing elements.

        Note this mutates `q` by popping phrasing elements off as they are processed.
        N)r   _iter_text_segmentsr(   r<   rR   rY   )rK   r3   r   rb   element_accumnodes         r0   r   zFlow._element_from_text_or_tailp  s       +,,T155 	 	D$,, !!$'''' )..z:::::::::



 &&z22222222222r2   Iterator[TextSegment | Element]c              #     K   t          |i           V  |r_|d         j        rTt          t          |                                          }|                                E d{V  |r|d         j        PdS dS dS dS )a  Generate zero-or-more `TextSegment`s or `Element`s from text and leading phrasing.

        Note that while this method is named "._iter_text_segments()", it can also generate
        `Element` objects when a block item is nested within a phrasing element. This is not
        technically valid HTML, but folks write some wacky HTML and the browser is pretty forgiving
        so we try to do the right thing (what the browser does) when that happens, generally
        interpret each nested block as its own paragraph and generate a separate `Element` object
        for each.

        This method is used to process the text or tail of a block element, including any phrasing
        elements immediately following the text or tail.

        For example, this <div>:

            <div>
               For a <b>moment, <i>nothing</i> happened.</b>
               <p>Then, after a second or so, nothing continued to happen.</p>
               The dolphins had always believed that <em>they</em> were far more intelligent.
            </div>

        Should generate three distinct elements:
        - One for the div's text "For a " and the <b> phrasing element after it,
        - one for the <p> element, and
        - one for the tail of the <p> and the phrasing <em> element that follows it.

        This method is invoked to process the first line beginning "For a" and the third line
        beginning "The dolphins", in two separate calls.

        Note this method mutates `q` by popping phrasing elements off as they are processed.
        r   N)r<   r   r   Phrasingr   iter_text_segments)rK   r3   r   r}   s       r0   r   zFlow._iter_text_segments  s      B $##### 	.AaD$ 	.Xqyy{{++A++---------  	.AaD$ 	. 	. 	. 	. 	. 	. 	. 	. 	.r2   r#   r   r#   rd   r#   r\   rH   )r3   r%   r   r   rb   rc   r#   rd   )r3   r%   r   r   r#   r   )r>   r?   r@   rA   r   r   r   r   r    r   r   r   rC   r2   r0   r   r   Q  s          K   XQ Q Q Q ) ) ) \)
 X\3 3 3 3 3&$. $. $. $. $. $.r2   r   c                      e Zd ZdZdS )	BlockItemzCustom element-class for `<p>` element, `<h1>`, and others like it.

    These can appear in a flow container like a div but can only contain phrasing content.
    Nr>   r?   r@   rA   rC   r2   r0   r   r     s           r2   r   c                      e Zd ZdZeZdS )HeadingzcAn `<h1>..<h6>` element.

    These are distinguished because they generate a `Title` element.
    N)r>   r?   r@   rA   r   r   rC   r2   r0   r   r     s         
 KKKr2   r   c                      e Zd ZdZdS )	ListBlockax  Either a `<ul>` or `<ol>` element, maybe a `<dl>` element at some point.

    The primary reason for distinguishing these is because they increment the hierarchy depth for
    lists that are nested inside them.

    Can only contain `<li>` elements (ignoring `<script>` and `<template>`). A list nested inside
    must actually be a child of one of these `<li>` elements.
    Nr   rC   r2   r0   r   r     s           r2   r   c                      e Zd ZdZeZdS )ListItemBlockz_A `<li>` element.

    These are distinguished because they generate a `ListItem` element.
    N)r>   r?   r@   rA   r   r   rC   r2   r0   r   r     s         
 KKKr2   r   c                  *    e Zd ZdZedd            ZdS )PrezVCustom element-class for `<pre>` element.

    Can only contain phrasing content.
    r#   r\   c                     t          |           S r   )r   rJ   s    r0   r   zPre._element_accum  s     &d+++r2   Nr   )r>   r?   r@   rA   r    r   rC   r2   r0   r   r     s>         
 , , , \, , ,r2   r   c                      e Zd ZdZddZdS )
TableBlockz+Custom element-class for `<table>` element.r#   Iterator[Table]c              #  z  K   t          t          t          j                 |                     d                    }|sdS ddfd|D             }t          |          }d	                    d
 |D                                                       }|dk    rdS t          |t          |                    V  dS )r   z+./tr | ./thead/tr | ./tbody/tr | ./tfoot/trNtretree._Elementr#   Iterator[str]c              3    K   t          t          t          j                 |                     d                    }|D ]A}d |                                D             }d                    d |D                       V  BdS )z'Generate the text of each cell in `tr`.z./td | ./thc              3  >   K   | ]}|                                 V  d S rH   )r7   ri   ts     r0   rk   zDTableBlock.iter_elements.<locals>.iter_cell_texts.<locals>.<genexpr>  s*      &H&HQqwwyy&H&H&H&H&H&Hr2   r5   c              3     K   | ]}||V  	d S rH   rC   r   s     r0   rk   zDTableBlock.iter_elements.<locals>.iter_cell_texts.<locals>.<genexpr>  s'      CCQCqCCCCCCr2   N)r   r&   r   _Elementxpathitertextr6   )r   tdstdstripped_text_nodess       r0   iter_cell_textsz1TableBlock.iter_elements.<locals>.iter_cell_texts  s       tEN+RXXm-D-DEEC D D 'I&H"++--&H&H&H#hhCC*=CCCCCCCCCD Dr2   c                @    g | ]}t           |                    S rC   )r&   )ri   r   r   s     r0   r~   z,TableBlock.iter_elements.<locals>.<listcomp>  s+    >>>Bd??2..//>>>r2   r5   c              3  T   K   | ]#}d                      d |D                       V  $dS )r5   c              3     K   | ]}||V  	d S rH   rC   r   s     r0   rk   z5TableBlock.iter_elements.<locals>.<genexpr>.<genexpr>  s'      &;&;Q&;q&;&;&;&;&;&;r2   N)r6   )ri   rows     r0   rk   z+TableBlock.iter_elements.<locals>.<genexpr>  s=      RRchh&;&;#&;&;&;;;RRRRRRr2   r   )text_as_htmlrm   )r   r   r#   r   )
r   r&   r   r   r   r   r6   r7   r   r   )rK   trs
table_data
html_table
table_textr   s        @r0   r   zTableBlock.iter_elements  s       4'4a)b)bcc 	F		D 		D 		D 		D ?>>>#>>>
1*==
XXRRzRRRRRXXZZ
FJj)Q)Q)QRRRRRRRRr2   N)r#   r   r>   r?   r@   rA   r   rC   r2   r0   r   r     s4        55S S S S S Sr2   r   c                      e Zd ZdZddZdS )RemovedBlocka  Elements that are to be ignored.

    An element may be ignored because it commonly contains boilerplate that would dilute the meaning
    extracted rather than contribute to it.

    All contents of a removed block item are ignored but its tail is emitted by its container.
    r#   rd   c              #  
   K   dS )z%Don't generate any document-elements.NrC   rJ   s    r0   r   zRemovedBlock.iter_elements  s      r2   Nr   r   rC   r2   r0   r   r     s2              r2   r   c                  d    e Zd ZdZedd            Zddd	ZddZddZddZ	ddZ
ddZddZdS ) r   zCBase-class for phrasing (inline/run) elements like bold and italic.r#   r   c                    dS )NTrC   rJ   s    r0   r   zPhrasing.is_phrasing  s    tr2   r   enclosing_emphasisr%   r   c              #     K   |                      |          }|                     |          E d{V  |                     |          E d{V  |                     |          E d{V  dS )DGenerate text segments for text, children, and tail of this element.N)_inside_emphasis_iter_text_segment_iter_child_text_segments_iter_tail_segment)rK   r   inside_emphasiss      r0   r   zPhrasing.iter_text_segments  s      //0BCC**?;;;;;;;;; 11/BBBBBBBBB
 **+=>>>>>>>>>>>r2   r3   emphasisr!   c                P    t          t          |          x}r|r||dni           S )a  Emphasis annotations that apply to text inside this element.

        No annotations are added when the text contains only whitespace. Otherwise, emphasis
        annotations are returned for the text contents, normalized as it will appear in the
        document-element.

        Emphasis annotations apply to the contents of all elements enclosed by the emphasis element.
        Sub-classes like the one for anchor elements that add non-emphasis annotations will need to
        override this method.
        )emphasized_text_contentsemphasized_text_tags)r   r:   )rK   r3   r   rr   s       r0   _annotationzPhrasing._annotation,  sD      #24#8#88>FRZ[[[
 
 	
r2   c                    |S )zBy default, the inside emphasis is the same as the outside emphasis.

        This method is overridden by sub-classes that annotate particular emphasis types but many
        phrasing elements do not contribute to annotations.
        rC   rK   r   s     r0   r   zPhrasing._inside_emphasis?  s
     "!r2   c              #  j  K   t          |           }|r|                                }|j        r/t          t          |                              |          E d{V  nQt          t          |                                          E d{V  |                     |j	        pd||          E d{V  |dS dS )zGenerate zero-or-more text-segments for phrasing children of this element.

        All generated text segments will be annotated with `emphasis` when it is other than the
        empty string.
        Nr   )
r   r   r   r   r   r   r   r   0_iter_text_segments_from_block_tail_and_phrasingr   )rK   r   r   childs       r0   r   z"Phrasing._iter_child_text_segmentsG  s       %*$KK  	IIKKE  %00CCHMMMMMMMMMMe,,::<<<<<<<<<PPJ$"a          	 	 	 	 	r2   Iterator[TextSegment]c              #  l   K   | j         x}r(t          ||                     ||                    V  dS dS )zGenerate zero-or-one text-segment for tail of this element.

        No text-segment is generated when this element has no tail node. However a segment _is_
        generated for a whitespace-only tail node.
        N)r   r<   r   )rK   r   r   s      r0   r   zPhrasing._iter_tail_segment[  P       94 	FdD$4$4T8$D$DEEEEEEE	F 	Fr2   c              #  l   K   | j         x}r(t          ||                     ||                    V  dS dS )zGenerate zero-or-one text-segment for text of this element.

        No text-segment is generated when this element has no text node. However a segment _is_
        generated for a whitespace-only text node.
        N)r3   r<   r   )rK   r   r3   s      r0   r   zPhrasing._iter_text_segmentd  r   r2   r   r   r   c              #  &  K   |r&t          ||                     ||                    V  |r`|d         j        rUt          t          |                                          }|                    |          E d{V  |r|d         j        QdS dS dS dS )a  Generate zero-or-more `TextSegment`s or `Element`s from tail+phrasing of block child.

        When this phrasing element contains a block child (not valid HTML but accepted by
        browsers), the tail of that block child and any phrasing elements contiguous with that tail
        also need to contribute their text. This method takes care of that job.

        Note this mutates `q` by popping phrasing elements off as they are processed.
        r   N)r<   r   r   r   r   r   r   )rK   r   r   r   r}   s        r0   r   z9Phrasing._iter_text_segments_from_block_tail_and_phrasingm  s        	FdD$4$4T8$D$DEEEEE 	6AaD$ 	6Xqyy{{++A++H555555555  	6AaD$ 	6 	6 	6 	6 	6 	6 	6 	6 	6r2   Nr   r   r   r%   r#   r   )r3   r%   r   r%   r#   r!   r   r%   r#   r%   )r   r%   r#   r   )r   r%   r#   r   )r   r%   r   r   r   r%   r#   r   )r>   r?   r@   rA   r   r   r   r   r   r   r   r   r   rC   r2   r0   r   r     s        MM   X? ? ? ? ?
 
 
 
&" " " "   (F F F FF F F F6 6 6 6 6 6r2   r   c                  <    e Zd ZdZdddZdd
ZddZddZddZdS )AnchorzLCustom element-class for `<a>` element.

    Provides link annotations.
    r   r   r%   r#   r   c              #    K   t          |                     |                    }|rg|                                }t          |t                    r|                     |          V  n&|                     |          x}r|V  n
|E d{V  |g|r8|                                }t          |t                    r|V  n|E d{V  |8|                     |          E d{V  dS )a  Generate text segments for contents and tail of this element, when they exist.

        Phrasing is emitted as `TextSegment` objects. Any nested block items (not valid HTML but
        are accepted by browser so can occur) are emitted as `Element` objects.

        When an anchor contains a nested block element, there can be multiple phrases and/or
        elements. Link annotation is only added to the first phrase or element. Otherwise the link
        annotation would span multiple document-elements.
        N)r   _iter_phrases_and_elementsr   r(   r   _link_annotate_element_link_text_segmentr   )rK   r   r   xltss        r0   r   zAnchor.iter_text_segments  s,      &+4+J+JK]+^+^%_%_  	!		A!W%% 	!11!44444 11!4443 !III LLLLLLL  	!  			A!W%%   	 **+=>>>>>>>>>>>r2   r   Iterator[Phrase | Element]c              #  d  K   t          |           }|                     | j        pd||          E d{V  |rx|d         j        rJ t	          t
          |                                          }|                                E d{V  |                     |j        pd||          E d{V  |vdS dS )zQDivide contents (text+children, but not tail) into phrases and document-elements.r   Nr   )	r   _iter_phrasingr3   r   r   r   r   r   r   )rK   r   r   r   s       r0   r   z!Anchor._iter_phrases_and_elements  s       %*$KK&&tyB8DDDDDDDDD 	Ot''''dAIIKK00J!//111111111**:?+@b!XNNNNNNNNN	  	O 	O 	O 	O 	Or2   r3   r   r   c           	   #    K   t                      }|r7|                    t          ||                     ||                               |r|d         j        rt          t          |                                          }|                    |          D ]K}t          |t                    r|                    |           -|
                                E d{V  |V  L|r|d         j        |
                                E d{V  dS )a8  Generate zero-or-more `TextSegment`s or `Element`s from text and leading phrasing.

        Note that while this method is named "._iter_phrasing()", it can also generate `Element`
        objects when a block item is nested within a phrasing element. This is not technically
        valid HTML, but folks write some wacky HTML and the browser is pretty forgiving so we try
        to do the right thing (what the browser does) when that happens, generally interpret each
        nested block as its own paragraph and generate a separate `Element` object for each.

        This method is used to process the text or tail of a block element, including any phrasing
        elements immediately following the text or tail.

        Note this method mutates `q` by popping phrasing elements off as they are processed.
        r   N)rF   rR   r<   r   r   r   r   r   r   r(   rY   )rK   r3   r   r   phrase_accumr}   r  s          r0   r  zAnchor._iter_phrasing  s2       *++ 	R[t/?/?h/O/OPPQQQ 	AaD$ 	Xqyy{{++A))(33  a--  $$Q''''  ,11333333333GGGG  	AaD$ 	  %%'''''''''''r2   r]   r   c                    |j         }|                     d          }|r|s|S |j        j        pg |gz   |j        _        |j        j        pg |gz   |j        _        |S )z8Apply this link's annotation to `element` and return it.href)r3   getrn   
link_texts	link_urls)rK   r]   	link_textlink_urls       r0   r  zAnchor._link_annotate_element  sm    L	88F## 	 	N'.'7'B'HbYK&W#&-&6&@&FB8*%T"r2   phraserD   TextSegment | Nonec                    d                     d |D                       }t          |          }|                     d          }|r|sdS t          |g|gdgd |D             R           }t	          ||          S )zConsolidate `phrase` into a single text-segment with link annotation.

        Returns None if the phrase contains only whitespace.
        r   c              3  $   K   | ]}|j         V  d S rH   r9   ri   rM   s     r0   rk   z,Anchor._link_text_segment.<locals>.<genexpr>  s%      #Q#Q,L$5#Q#Q#Q#Q#Q#Qr2   r  N)r  r  c              3  $   K   | ]}|j         V  d S rH   rg   r  s     r0   rk   z,Anchor._link_text_segment.<locals>.<genexpr>  s%      EEl,)EEEEEEr2   )r6   r:   r  r1   r<   )rK   r  consolidated_textr  r  consolidated_annotationss         r0   r  zAnchor._link_text_segment  s    
 GG#Q#Q&#Q#Q#QQQ#$566	88F## 	 	4 $< ){(DDEEfEEE $
 $
  ,.FGGGr2   Nr   r   )r   r%   r#   r  )r3   r%   r   r   r   r%   r#   r  )r]   r   r#   r   )r  rD   r#   r  )	r>   r?   r@   rA   r   r   r  r  r  rC   r2   r0   r   r     s         
$? $? $? $? $?LO O O O (  (  (  (D   H H H H H Hr2   r   c                      e Zd ZdZddZdS )Boldz*Provides annotations for bold/strong text.r   r%   r#   c                j    t          |dz             }d                    t          |                    S )a  Emphasis tags that apply to text inside this element.

        Formed by adding "b" (for "bold") to the enclosing emphasis, unless it's already there.
        The returned emphasis tuple is sorted to make its form canonical, which eases testing. For
        Example `("b", "i")` and `("i", "b")` are semantically the same but don't directly compare
        equal in a test. Sorting it basically gives it some set-like properties.
        br   setr6   sortedrK   r   charss      r0   r   zBold._inside_emphasis  s/     &,--wwve}}%%%r2   Nr   r>   r?   r@   rA   r   rC   r2   r0   r  r    s.        44	& 	& 	& 	& 	& 	&r2   r  c                      e Zd ZdZddZdS )Italicz0Provides annotations for italic/emphasized text.r   r%   r#   c                j    t          |dz             }d                    t          |                    S )zEmphasis tags that apply to text inside this element.

        Formed by adding "i" (for "italic") to the enclosing emphasis, unless it's already there.
        ir   r  r   s      r0   r   zItalic._inside_emphasis  s/    
 &,--wwve}}%%%r2   Nr   r"  rC   r2   r0   r$  r$    s.        ::& & & & & &r2   r$  c                      e Zd ZdZd	d
dZdS )	LineBreakzA `<br/>` line-break element.

    It's only special behavior is to add whitespace such that phrasing tight on both sides is not
    joined, like `abc<br/>def` should become "abc def", not "abcdef".
    r   r   r%   r#   r   c              #  d   K   t          di           V  |                     |          E d{V  dS )r   r   N)r<   r   r   s     r0   r   zLineBreak.iter_text_segments   sI      $#####**+=>>>>>>>>>>>r2   Nr   r   r%   r#   r   r>   r?   r@   rA   r   rC   r2   r0   r(  r(    s7         ? ? ? ? ? ? ?r2   r(  c                      e Zd ZdZd	d
dZdS )RemovedPhrasingzPhrasing where we want to skip the content.

    - `.is_phrasing` is True so it doesn't break the paragraph like a block.
    - `element.text` is discarded
    - `element.tail` is preserved
    r   r   r%   r#   r   c              #  @   K   |                      |          E d{V  dS )z4Generate text segment for tail only of this element.Nr   r   s     r0   r   z"RemovedPhrasing.iter_text_segments.  s3      **+=>>>>>>>>>>>r2   Nr   r*  r+  rC   r2   r0   r-  r-  &  s7         ? ? ? ? ? ? ?r2   r-  c                  <    e Zd ZdZedd            ZddZdddZdS )DefaultElementa  Custom element-class used for any element without an assigned custom element class.

    An unrecognized element is given both Flow (block) and Phrasing (inline) behaviors. It behaves
    like a Flow element When nested in a Flow element like a Phrasing element when nested in a
    Phrasing element.

    The contents of the element is skipped in either case, but its tail is not when it behaves as a
    Phrasing element. The tail is processed by its parent when that is a Flow element.
    r#   r   c                    dS )af  If asked (by a parent Flow element), identify as a phrasing element.

        It's not possible to determine the display intent (block|inline) of an unknown element
        (like `<foobar>`) and phrasing is less disruptive, adding the tail of this element to any
        text or phrasing content before and after it without starting a new paragraph.
        TrC   rJ   s    r0   r   zDefaultElement.is_phrasingA  s	     tr2   rd   c              #  
   K   dS )ae  Don't generate any document-elements when behaving like a Flow element.

        Because the element identifies as phrasing and will always be enclosed by at least a
        `<body>` element, this method should never be called. However, it's easier to prove it does
        the appropriate thing if it is called than prove that it can never happen.
        NrC   rJ   s    r0   r   zDefaultElement.iter_elementsK  s       	r2   r   r   r%   r   c              #  @   K   |                      |          E d{V  dS )a1  Generate text segment for tail of this element only.

        This method is only called on Phrasing elements and their children. In that case, act like a
        Phrasing element but don't generate a text segment for this element or any children. Do
        however generate a tail text-segment.
        Nr/  r   s     r0   r   z!DefaultElement.iter_text_segmentsU  s5       **+=>>>>>>>>>>>r2   Nr   r   r   r*  )r>   r?   r@   rA   r   r   r   r   rC   r2   r0   r1  r1  6  sk             X   
? 
? 
? 
? 
? 
? 
?r2   r1  type[Text] | Nonec                   t          |           rt          S t          |           rt          S t	          |           rt
          S t          |           dk     rdS t          |           rt          S t          |           rt          S t          S )zBProduce a document-element of the appropriate sub-type for `text`.   N)r   r   r   r   r   r   r   r   r   r   r   r   r9   s    r0   rp   rp   g  s     D!!  
4yy1}}t!$''   Kr2   T)remove_comments)r]   Naddressarticleaside
blockquotebodycenterdivfooterheaderhgroupmainsectionr   r   r   r   r   r   pprerz   r{   rv   tabler-   r  emr&  strongabbrbdibdobigbrcitecodedatadfnkbdmarkmeterr   ssampsmallspanstrikesubsuptimettuvarwbrbuttonlabeldetailsry   rw   dtfigurehrnavtemplateforminputsummary)r   r"   r#   r!   )r3   r%   r#   r%   )r3   r%   r#   r5  )NrA   
__future__r   collectionsr   r   typesr   typingr   r   r	   r
   r   r   r   lxmlr   typing_extensionsr   unstructured.cleaners.corer   unstructured.common.html_tabler   unstructured.documents.elementsr   r   r   r   r   r   r   r   r    unstructured.partition.text_typer   r   r   r   r   unstructured.utilsr    r%   r!   rB   r1   r:   r<   rD   rF   r\   r   ElementBaser   r   r   r   r   r   r   r   r   r   r  r$  r(  r-  r1  rp   
HTMLParserhtml_parserElementDefaultClassLookupfallbackElementNamespaceClassLookupelement_class_lookupset_element_class_lookupget_namespaceupdaterC   r2   r0   <module>r     s	  G G GR # " " " " " * * * * * * * * " " " " " " O O O O O O O O O O O O O O O O O O       ' ' ' ' ' ' 4 4 4 4 4 4 G G G G G G
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
              , + + + + +  S)
 ) ) ) )8 8 8 8 	* 	* 	* 	*    *     [) ) ) ) ).# # # # # # # #<ZP ZP ZP ZP ZP ZP ZP ZPz    0   4V. V. V. V. V.5 V. V. V.r           d              D   	, 	, 	, 	, 	,) 	, 	, 	,!S !S !S !S !S !S !S !SH    4   $e6 e6 e6 e6 e6u  e6 e6 e6P|H |H |H |H |HX |H |H |H~& & & & &8 & & &	& 	& 	& 	& 	&X 	& 	& 	&
? 
? 
? 
? 
? 
? 
? 
?
? 
? 
? 
? 
?h 
? 
? 
? )? )? )? )? )?T8 )? )? )?b   J et444*5*>BBB8u8BB   $ $%9 : : :  " "4 ( ( / /M4M 	4M 		M
 	dM 	M 	$M 	tM 	$M 	$M 	$M 	M 	4M 	gM  	g!M" 	g#M$ 	g%M& 	g'M M( 	g)M* 	Y+M, 	s-M0 	i1M2 	i3M4 	m5M8 	9M< 	V=M> 	T?M@ 	fAMB 	VCMD 	$EMH 	IMJ 	xKML 	xMMN 	xOMP 	iQM M MR 	SMT 	UMV 	WMX 	xYMZ 	x[M\ 	]M^ 	_M` 	XaMb 	XcMd 	eMf 	gMh 	iMj 	(kMl 	xmMn 	xoMp 	qMr 	hsM M Mt 	XuMv 	xwMx 	xyM| 	/}M~ 	MB 	<CMD 	lEMF 	lGMH 	lIMJ 	,KML 	lMMN 	|OMP 	LQMT 	UMV 	WMX 	<YM MO O O O Or2   