
    Ng\                    4   d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ d Zej                            d	g d
          d)d            Z G d d          Z  G d d          Z! G d d          Z" G d d          Z# G d d          Z$ G d d          Z% G d d          Z& G d d          Z' G d d           Z( G d! d"          Z) G d# d$          Z* G d% d&          Z+ G d' d(          Z,dS )*z;Test suite for `unstructured.partition.html.parser` module.    annotations)dequeN)etree)AddressElementListItemNarrativeTextTextTitle)
AnnotationDefaultElementFlowPhrasingRemovedPhrasingTextSegment_consolidate_annotations_ElementAccumulator_normalize_text_PhraseAccumulator_PreElementAccumulatorhtml_parserc                 (   ddddddddg} t          |           } | ddgddgdgdgdk    sJ t          j        t          d	
          5  d| d<   d d d            n# 1 swxY w Y   | d                             d           | d         g dk    sJ d S )NzFord Prefectzhttps://wikipedia/Ford_Prefectb)
link_textslink_urlemphasized_text_contentsemphasized_text_tagszalien encounterbir   r   )r   r   r   r   z'object does not support item assignment)matchfoobarnew_keyr   xyz)r   r   r$   )r   pytestraises	TypeErrorappendr   s    h/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/partition/html/test_parser.py7it_consolidates_annotations_from_multiple_text_segmentsr*   "   s6    )8(6$'		
 	
 ):$(	
 	
K +;77K%35F$G!$d%&56      
y(Q	R	R	R * *!)I* * * * * * * * * * * * * * * &'..u555-.2D2D2DDDDDDDs   
AA #A textexpected_value))iterators allowr.   )zalgorithm
to   bezalgorithm to be)z  separated
  from  zseparated from)z
 container
 details
 zcontainer details)zM
  iterators  allow 
 algorithm to be   
expressed  without container  
noisezAiterators allow algorithm to be expressed without container noiser,   strr-   c                0    t          |           |k    sJ d S N)r   r+   s     r)   ,test_normalize_text_produces_normalized_textr2   E   s#      4  N222222    c                  $    e Zd ZdZd Zd Zd ZdS )Describe_PhraseAccumulatorzUIsolated unit-test suite for `unstructured.partition.html.parser._PhraseAccumulator`.c                    t                      }|                                }t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S r1   r   flushr%   r&   StopIterationnextselfaccumphrase_iters      r)   it_is_empty_on_constructionz6Describe_PhraseAccumulator.it_is_empty_on_construction^       "$$kkmm]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	   AA Ac                   t                      }|                    t          di                      |                    t          di                      |                                }t	          |          }|t          di           t          di           fk    sJ t          j        t                    5  t	          |           d d d            d S # 1 swxY w Y   d S )NFord... you're turning into a penguin.)r   addr   r8   r:   r%   r&   r9   )r<   r=   r>   phrases       r)   it_accumulates_text_segmentsz7Describe_PhraseAccumulator.it_accumulates_text_segmentsh   s   "$$		+7<<===		+/44555kkmmk""1266)2..
 
 
 
 

 ]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   7CCCc                    t                      }|                                }t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S r1   r7   r;   s      r)   -it_generates_zero_phrases_on_flush_when_emptyzHDescribe_PhraseAccumulator.it_generates_zero_phrases_on_flush_when_emptyz   r@   rA   N)__name__
__module____qualname____doc__r?   rG   rI    r3   r)   r5   r5   [   sG        __    $    r3   r5   c                  d   e Zd ZdZd6dZd6dZd6dZd6dZd6dZd6d	Z	d6d
Z
d6dZd6dZd Zd6dZej                            dddedfddedfddedfddedfddedfddedfddedfddedfd d!edfd"d#ed$fd%d&ed'fd(d)ed*fg          d7d2            Zd6d3Z ej                    d8d5            ZdS )9Describe_ElementAccumulatorzVIsolated unit-test suite for `unstructured.partition.html.parser._ElementAccumulator`.html_elementetree.ElementBasec                    t          |          }|                    d           }t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S r1   r   r8   r%   r&   r9   r:   r<   rQ   r=   element_iters       r)   r?   z7Describe_ElementAccumulator.it_is_empty_on_construction       #L11{{4((]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	   AA"Ac                   t          |          }|                    t          di                      |                    t          di                      |                    d           }t	          |          }|t          d          k    sJ t          j        t                    5  t	          |           d d d            d S # 1 swxY w Y   d S )NrC   rD   &Ford... you're turning into a penguin.)	r   rE   r   r8   r:   r
   r%   r&   r9   )r<   rQ   r=   rV   elements        r)   rG   z8Describe_ElementAccumulator.it_accumulates_text_segments   s   #L11		+7<<===		+/44555{{4((|$$-(PQQQQQQ]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   (CC	C	c                    t          |          }|                    d           }t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S r1   rT   rU   s       r)   %it_generates_zero_elements_when_emptyzADescribe_ElementAccumulator.it_generates_zero_elements_when_empty   rW   rX   c                X   t          |          }|                    t          di                      |                    t          di                      t          j        t
                    5  t          |                    d                      d d d            d S # 1 swxY w Y   d S )N 
   	 
z   
r   rE   r   r%   r&   r9   r:   r8   r<   rQ   r=   s      r)   Mand_it_generates_zero_elements_when_all_its_text_segments_are_whitespace_onlyziDescribe_ElementAccumulator.and_it_generates_zero_elements_when_all_its_text_segments_are_whitespace_only        $L11		+mR00111		+gr**+++]=)) 	$ 	$T""###	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$   /#BB#&B#c                X   t          |          }|                    t          di                      |                    t          di                      t          j        t
                    5  t          |                    d                      d d d            d S # 1 swxY w Y   d S )Nr_   z X 
r`   ra   s      r)   Nand_it_generates_zero_elements_when_there_is_only_one_non_whitespace_characterzjDescribe_ElementAccumulator.and_it_generates_zero_elements_when_there_is_only_one_non_whitespace_character   rc   rd   c                    t          |          }|                    t          di                      |                    t          di                      |                    d           \  }|j        dk    sJ d S Nz 
  Ford...   you're 	 turning
zinto a   penguin.
rZ   )r   rE   r   r8   r,   r<   rQ   r=   r[   s       r)   4it_normalizes_the_text_of_its_text_segments_on_flushzPDescribe_ElementAccumulator.it_normalizes_the_text_of_its_text_segments_on_flush   sv    #L11		+BBGGHHH		+3R88999[[&&
|GGGGGGGr3   c                    t          |          }|                    t          di                      |                    t                    \  }|t	          d          k    sJ d S NrZ   r   rE   r   r8   r	   ri   s       r)   3it_creates_a_document_element_of_the_specified_typezODescribe_ElementAccumulator.it_creates_a_document_element_of_the_specified_type   s_    #L11		+FKKLLL[[**
(#KLLLLLLLLr3   c                    t          |          }|                    t          di                      |                    d           \  }|t	          d          k    sJ d S rl   )r   rE   r   r8   r
   ri   s       r)   Dbut_it_derives_the_element_type_from_the_text_when_none_is_specifiedz`Describe_ElementAccumulator.but_it_derives_the_element_type_from_the_text_when_none_is_specified   sa     $L11		+FKKLLL[[&&
-(PQQQQQQQQr3   c                    t          |          }|                    t          di                      |                    d           \  }|t	          d          k    sJ d S )Nz* turning into a penguinzturning into a penguinrm   ri   s       r)   @it_removes_an_explicit_leading_bullet_character_from_a_list_itemz\Describe_ElementAccumulator.it_removes_an_explicit_leading_bullet_character_from_a_list_item   sa     $L11		+8"==>>>[[&&
(#;<<<<<<<<r3   c                v   t          j        dt                                        d          d         }t	          |          }|                    t          di                      |                    t                    \  }|	                                }|
                    d           |ddiddd	k    sJ d S )
Nz<h3>About fish</h3>z.//h3r   zThanks for all those!
element_idcategory_depth   r   metadatar,   type)r   
fromstringr   xpathr   rE   r   r8   r   to_dictpopr<   rQ   r=   r[   es        r)   "it_applies_category_depth_metadataz>Describe_ElementAccumulator.it_applies_category_depth_metadata   s    '(={KKQQRYZZ[\]#L11		+5r::;;;[[''
OO	l)1-+
 
 
 
 
 
 
 
r3   c                   t          |          }|                    t          dddd                     |                    t          di                      |                    t          dddd                     |                    t          di                      |                    t                    \  }|                                }|                    d	           |ddgddgdd
ddk    sJ d S )Nz
    Ford...Fordr   r    z you're turning into a penguiniz.
rt   rZ   r
   rw   )r   rE   r   r8   r
   r|   r}   r~   s        r)   -and_it_consolidates_annotations_into_metadatazIDescribe_ElementAccumulator.and_it_consolidates_annotations_into_metadata   sC   #L11		06,/  	
 	
 	
 			+7<<===		09,/  	
 	
 	
 			+eR(()))[[//
OO	l -
 )	 	 =#
 
 
 
 
 
 
 
r3   )	html_texttag
ElementClsr-   z5<p>Ford... you're turning into a penguin. Stop it.<p>pNz!<p>* thanks for all the fish.</p>r   z!<li>thanks for all the fish.</li>liz><ul><li>So long</li><li>and thanks for all the fish.</li></ul>   z><dl><dd>So long<ol><li>and thanks for the fish.</li></ol></ul>rv   z<p>Examples</p>z<h1>Examples</h1>h1z<h2>Examples</h2>h2z<h3>Examples</h3>h3z<h4>Examples</h4>h4   z<h5>Examples</h5>h5   z<h6>Examples</h6>h6   r   r/   r   r   type[Element]r-   
int | Nonec                    t          j        |t                                        d|           d         }t	          |          }|                    |          |k    sJ d S )Nz.//r   )r   rz   r   r{   r   _category_depth)r<   r   r   r   r-   r   r=   s          r)   &it_computes_the_category_depth_to_helpzBDescribe_ElementAccumulator.it_computes_the_category_depth_to_help  sa    ( Y44::;;;GGJ#A&&$$Z00NBBBBBBr3   c                    t          |          }|                    t          di                      |                    t          di                      |j        dk    sJ d S rh   )r   rE   r   _normalized_textra   s      r)   <it_computes_the_normalized_text_of_its_text_segments_to_helpzXDescribe_ElementAccumulator.it_computes_the_normalized_text_of_its_text_segments_to_help5  sf     $L11		+BBGGHHH		+3R88999%)QQQQQQQr3   returnc                h    t          j        dt                                        d          d         S )N<p/>.//pr   )r   rz   r   r{   )r<   s    r)   rQ   z(Describe_ElementAccumulator.html_element@  s(    44::6BB1EEr3   )rQ   rR   )r   r/   r   r/   r   r   r-   r   )r   rR   )rJ   rK   rL   rM   r?   rG   r]   rb   rf   rj   rn   rp   rr   r   r   r%   markparametrizer   r	   r   r   r   fixturerQ   rN   r3   r)   rP   rP      s       ``         $ $ $ $$ $ $ $H H H HM M M MR R R R= = = =
 
 
(
 (
 (
 (
X [<Dc4QUV0#xC0$!DMtU]_`aMtU]_`aUA. $q1 $q1 $q1 $q1 $q1 $q1	
 "C C C# "CR R R R V^F F F F F Fr3   rP   c                      e Zd ZdZd ZdS )Describe_PreElementAccumulatorzYIsolated unit-test suite for `unstructured.partition.html.parser._PreElementAccumulator`.c                   t          j        dt                                        d          d         }t	          |          }|                    t          di                      |                    t          di                      |                    t          di                      |                    t          di                      |j        dk    sJ d S )	Nr   r   r   z

z    The panel lit up
z(    with the words 'Please do not press
z    this button again'

zU
    The panel lit up
    with the words 'Please do not press
    this button again'
)r   rz   r   r{   r   rE   r   r   ra   s      r)   r   z[Describe_PreElementAccumulator.it_computes_the_normalized_text_of_its_text_segments_to_helpH  s    '<<BB6JJ1M&|44		+fb))***		+6;;<<<		+I2NNOOO		+:B??@@@ %'
 
 
 
 
 
r3   N)rJ   rK   rL   rM   r   rN   r3   r)   r   r   E  s)        cc
 
 
 
 
r3   r   c                      e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
ej                            d	d
di fgfddi fddddfdi fddddfdi fgfddi fddddfddddfdi fgfg          d!d            Zd S )"DescribeFlowzIsolated unit-test suite for `unstructured.partition.html.parser.Flow`.

    The `Flow` class provides most behaviors for flow (block-level) elements.
    c                    t          j        dt                                        d          d         }t	          |t
                    sJ |j        du sJ d S )Nz<p>Hello</p>r   r   F)r   rz   r   r{   
isinstancer   is_phrasing)r<   r   s     r)   %it_knows_it_is_NOT_a_phrasing_elementz2DescribeFlow.it_knows_it_is_NOT_a_phrasing_elementd  sV    ^[99??GGJ!T"""""}%%%%%%r3   c                   d}t          j        |t                                        d          d         }|                                }t          |          }|t          d          k    sJ |j                                        dg dg ddk    sJ t          |          }|t          d          k    sJ |j                                        d	gd
gdk    sJ t          |          }|t          d          k    sJ |j                                        dg dg ddk    sJ t          |          }|t          d          k    sJ |j                                        ddik    sJ t          |          }|t          d          k    sJ |j                                        dddgddgdk    sJ t          j        t                    5  t          |          }ddd           dS # 1 swxY w Y   dS )a?  Phrasing siblings of child block elements are processed with text or tail.

        In the general case, a Flow element can contain text, phrasing content, and child flow
        elements.

        Each of these five lines in this example is a "paragraph" and gives rise to a distinct
        document-element.
        a  
          <div>
            Text of div <b>with <i>hierarchical</i>
phrasing</b> content before first block item
            <p>Click <a href="http://blurb.io">here</a> to see the blurb for this block item. </p>
            tail of block item <b>with <i>hierarchical</i> phrasing </b> content
            <p>second block item</p>
            tail of block item <b>with <i>  hierarchical  </i></b> phrasing content
          </div>
        .//divr   zFText of div with hierarchical phrasing content before first block item)withhierarchicalphrasingr   r   r   )ru   r   r   z0Click here to see the blurb for this block item.herezhttp://blurb.ior   	link_urlsz5tail of block item with hierarchical phrasing contentzsecond block itemru   r   r   r   r   N)r   rz   r   r{   iter_elementsr:   r   rx   r|   r
   r%   r&   r9   r<   r   divelementsr   s        r)   8it_generates_the_document_elements_from_the_Flow_elementzEDescribeFlow.it_generates_the_document_elements_from_the_Flow_elementl  s   	 y+66<<XFFqI$$&&NNEbccccccz!!##(L(L(L$4$4$4(
 (
 
 
 
 

 NNM"TUUUUUUz!!##vhN_M`'a'aaaaaNNEQRRRRRRz!!##(L(L(L$4$4$4(
 (
 
 
 
 

 NNE-......z!!##(8!'<<<<<NNEQRRRRRRz!!##)/(@%($K(
 (
 
 
 
 

 ]=)) 	 	XA	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   G..G25G2c                `   d}t          j        |t                                        d          d         }|                    |j        t          |          t                    }t          |          }|t          d          k    sJ |j	        
                                g dg ddk    sJ dS )	zJText and tails and their phrasing content are both processed the same way.zK<div>The 
 Roman <b>poet <i>   Virgil</i> gave</b> his <q>pet</q> fly</div>r   r   z&The Roman poet Virgil gave his pet fly)poetVirgilgaver   r    N)r   rz   r   r{   _element_from_text_or_tailr,   r   r   r:   rx   r|   r   s        r)   4it_assembles_text_and_tail_document_elements_to_helpzADescribeFlow.it_assembles_text_and_tail_document_elements_to_help  s    b	y+66<<XFFqI11#(E#JJMMNNDABBBBBBz!!##(B(B(B$4$4$4(
 (
 
 
 
 
 
 
r3   c                N   d}t          j        |t                                        d          d         }|                    |j        t          |          t                    }t          j	        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )Nz6<div>   <b> 
 <i>  
 </i>  </b>   <q> 
 </q> 
  </div>r   r   )r   rz   r   r{   r   r,   r   r   r%   r&   r9   r:   r<   r   r   r   s       r)   Mbut_it_does_not_generate_a_document_element_when_only_whitespace_is_containedzZDescribeFlow.but_it_does_not_generate_a_document_element_when_only_whitespace_is_contained  s    P	y+66<<XFFqI11#(E#JJMM]=)) 	 	NNN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   =BB!Bc                   d}t          j        |t                                        d          d         }|                    |j        t          |          t                    }t          |          }|t          d          k    sJ |j	        
                                i k    sJ t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )Nz;<div>
  The line-storm clouds fly tattered and swift
</div>r   r   z,The line-storm clouds fly tattered and swift)r   rz   r   r{   r   r,   r   r   r:   rx   r|   r%   r&   r9   r   s        r)   @it_uses_the_specified_element_class_to_form_the_document_elementzMDescribeFlow.it_uses_the_specified_element_class_to_form_the_document_element  s   S	y+66<<XFFqI11#(E#JJPPNNGJKKKKKKz!!##r))))]=)) 	 	NNN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s    CC!$C!c                   d}t          j        |t                                        d          d         }|                    |j        t          |                    }t          |          t          d          k    sJ d S )Nz<<div>
  The line-storm clouds fly tattered and swift,
</div>r   r   z-The line-storm clouds fly tattered and swift,)	r   rz   r   r{   r   r,   r   r:   r
   r   s       r)   Rand_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specifiedz_DescribeFlow.and_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specified  so    T	y+66<<XFFqI11#(E#JJGGH~~/^!_!_______r3   c                B   d}t          j        |t                                        d          d         }|                    |j        t          |                    }t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )Nz<div> * </div>r   r   )r   rz   r   r{   r   r,   r   r%   r&   r9   r:   r   s       r)   Ubut_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_containedzbDescribeFlow.but_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_contained  s    $	y+66<<XFFqI11#(E#JJGG]=)) 	 	NNN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   7BBBr   r-   z,<p>Ford... you're turning into a penguin.<p>rZ   z:<p>Ford... <b>you're turning</b> into
a <i>penguin</i>.<p>zFord... zyou're turningr   r    z into
a r   r   .z:<p>Ford... <b>you're <i>turning</i></b> into a penguin.<p>zyou're zyou'returningr   z into a penguin.r   r/   r-   list[Annotation]c                    t          j        |t                                        d          d         }t	          |                    |j        t          |                              }||k    sJ d S )Nr   r   )r   rz   r   r{   list_iter_text_segmentsr,   r   )r<   r   r-   r   text_segmentss        r)   Eit_recursively_generates_text_segments_from_text_and_phrasing_to_helpzRDescribeFlow.it_recursively_generates_text_segments_from_text_and_phrasing_to_help  sf    V Y44::6BB1EQ2216588DDEE......r3   N)r   r/   r-   r   )rJ   rK   rL   rM   r   r   r   r   r   r   r   r%   r   r   r   rN   r3   r)   r   r   \  s        & & &2 2 2l
 
 
   
 
 
` ` `   [' ?:B?@
 N$(5E_bcc !"%!5>X[\\ "I" M$!5=WZ[[
 "5>X\]] (,+$	
' 'P/ / /Q' 'P/ / /r3   r   c                  l    e Zd ZdZd Zej                            dg d          dd            Zd	 Z	d
S )DescribePrezIsolated unit-test suite for `unstructured.partition.html.parser.Pre`.

    The `Pre` class specializes behaviors for the `<pre>` (pre-formatted text) element.
    c                b   d}t          j        |t                                        d          d         }|                                }t          |          }|t          d          k    sJ t          j        t                    5  t          |           ddd           dS # 1 swxY w Y   dS )z4A `<pre>` element can contain only phrasing content.z<pre>
  The Answer to the Great Question...   Of Life, the Universe and Everything...
  Is... Forty-two, said Deep Thought, with infinite majesty and calm.
</pre>
.//prer   z  The Answer to the Great Question...   Of Life, the Universe and Everything...
  Is... Forty-two, said Deep Thought, with infinite majesty and calm.N)
r   rz   r   r{   r   r:   r   r%   r&   r9   )r<   r   prer   r   s        r)   9it_preserves_the_whitespace_of_its_phrasing_only_contentszEDescribePre.it_preserves_the_whitespace_of_its_phrasing_only_contents  s     	 y+66<<XFFqI$$&&NNDT
 
 
 
 
 
 ]=)) 	 	NNN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   B$$B(+B(r   ))z<pre>
  foo  </pre>  foo  )z<pre> 
  foo  </pre>z	 
  foo  )z<pre>

  foo  </pre>z
  foo  )z<pre>  foo  
</pre>r   )z<pre>  foo  
 </pre>z	  foo  
 )z<pre>  foo  

</pre>z  foo  
)z<pre>
  foo  
</pre>r   )z<pre> 
  foo  
 </pre>z 
  foo  
 r   r/   r-   c                    t          j        |t                                        d          d         }t	          |                                          }|j        |k    sJ dS )zSContent starts on next line when opening `<pre>` tag is immediately followed by `
`r   r   N)r   rz   r   r{   r:   r   r,   )r<   r   r-   r   r   s        r)   2but_it_strips_a_single_leading_or_trailing_newlinez>DescribePre.but_it_strips_a_single_leading_or_trailing_newline%  s[    2 y+66<<XFFqI""$$%%v''''''r3   c                d   d}t          j        |t                                        d          d         }t	          |                                          }|j        dk    sJ |j        j        dgk    sJ |j        j	        dgk    sJ |j        j
        dgk    sJ |j        j        dgk    sJ d S )	NzL<pre>You're <b>turning</b> into a <a href="http://eie.io">penguin</a>.</pre>r   r   zYou're turning into a penguin.r   r   r   http://eie.io)r   rz   r   r{   r:   r   r,   rx   r   r   r   r   )r<   r   r   r   s       r)   Pit_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elementsz\DescribePre.it_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elementsC  s    c	y+66<<XFFqI""$$%%v99999z2ykAAAAz.3%7777z$3333z#'8888888r3   N)r   r/   r-   r/   )
rJ   rK   rL   rM   r   r%   r   r   r   r   rN   r3   r)   r   r     s         
  ( ['	
 	
 	
 *( ( (+ *(
9 
9 
9 
9 
9r3   r   c                      e Zd ZdZd ZdS )DescribeRemovedBlockzIsolated unit-test suite for `unstructured.partition.html.parser.RemovedBlock`.

    This class is used for block level items we want to skip like `<hr/>` and `<figure>`.
    c                    d}t          j        |t                                        d          d         }t	          |                                          t          d          gk    sJ d S )Na&  
          <div>
            <hr/>
            <figure>
              <img src="/media/cc0-images/elephant-660-480.jpg" alt="Elephant at sunset" />
              <figcaption>An elephant at sunset</figcaption>
            </figure>
            <p>Content we want.</p>
          </div>
          r   r   zContent we want.)r   rz   r   r{   r   r   r
   )r<   r   r   s      r)   it_is_skipped_during_parsingz1DescribeRemovedBlock.it_is_skipped_during_parsingV  sg    		 y+66<<XFFqIC%%''((];M-N-N,OOOOOOOr3   N)rJ   rK   rL   rM   r   rN   r3   r)   r   r   P  s2         
P P P P Pr3   r   c                  r   e Zd ZdZd Zej                            ddg fddi fgfddi fgfd	d
i fdi fgfddi fd
i fdi fdi fdi fgfg          dDd            Zej                            dd e	d          gfd e
dddd           e	d          gfd e	d           e
dddd          gfd e
dddd           e	d           e
dddd          gfg          dEd            Zd Zej                            d d!d"g          dFd#            Zd$ Zej                            d%g d&          dGd'            Zej                            dd(g fd) e
di           gfd* e
di            e
di           gfd+ e
di            e
di            e
di            e
d,i            e
d-i            e
d.i           gfg          dDd/            Zej                            d0d1d!g fd2d! e	d          gfd3d! e	d           e
di           gfd4d! e	d           e
di            e
di            e
d,i           gfd5d e
dddd           e	d           e
dddd           e
d,d,dd           e
d-d-dd          gfg          dHd7            Zej                            d8d9d!g fd:d! e
di           gfd;d! e
di           gfd<d! e
di            e
di           gfd=d! e
di            e
di            e
di            e
d,i            e
d-i           gfd>d e
dddd           e
dddd           e
ddd?d           e
d,d,dd           e
d-d-dd          gfd@d! e
di            e
di            e	d           e
d,i            e
d-i           gfg          dIdB            ZdCS )JDescribePhrasingzIsolated unit-test suite for `unstructured.partition.html.parser.Phrasing`.

    The `Phrasing` class provides most behaviors for phrasing (inline) elements.
    c                    t          j        dt                                        d          d         }t	          |t
                    sJ |j        du sJ d S )Nz<b>Hello</b>.//br   T)r   rz   r   r{   r   r   r   )r<   r   s     r)   !it_knows_it_is_a_phrasing_elementz2DescribePhrasing.it_knows_it_is_a_phrasing_elementq  sV    ^[99??GGJ!X&&&&&}$$$$$$r3   r   z<code></code>z<data> foo </data>z foo z<dfn/> bar z bar z.<kbd><mark>foo <meter>bar</meter></mark></kbd>zfoo barz4<kbd> <mark>foo <meter>bar</meter> baz</mark> </kbd> z bazr   r/   r-   list[TextSegment]c                    t          j        |t                                        d          d         d         }t	          |                                          |k    sJ d S N.//bodyr   r   rz   r   r{   r   iter_text_segmentsr<   r   r-   r   s       r)   =it_generates_text_segments_for_its_text_and_children_and_tailzNDescribePhrasing.it_generates_text_segments_for_its_text_and_children_and_taily  sY    6 Y44::9EEaHKA((**++~======r3   z<strong><p>aaa</p></strong>aaaz<strong>aaa<p>bbb</p></strong>r   r    bbbz<strong><p>aaa</p>bbb</strong>z!<strong>aaa<p>bbb</p>ccc</strong>ccclist[TextSegment | Element]c                    t          j        |t                                        d          d         d         }t	          |                                          |k    sJ d S r   r   r   s       r)   Fbut_it_can_also_generate_an_element_when_it_has_a_nested_block_elementzWDescribePhrasing.but_it_can_also_generate_an_element_when_it_has_a_nested_block_element  sZ    V Y44::9EEaHKA((**++~======r3   c                    t          j        dt                                        d          d         }|                    dd          dddk    sJ d S )N<cite/>.//citer   z
  foobar
  r   r"   r    r   rz   r   r{   _annotationr<   cites     r)   &it_forms_its_annotations_from_emphasisz7DescribePhrasing.it_forms_its_annotations_from_emphasis  si    	;77==iHHK 0$77(0$(<
 <
 
 
 
 
 
 
r3   r,    z
  	  c                    t          j        dt                                        d          d         }|                    |d          i k    sJ d S )Nr   r   r   r   r   )r<   r,   r   s      r)   (but_not_when_text_is_empty_or_whitespacez9DescribePhrasing.but_not_when_text_is_empty_or_whitespace  sN    	;77==iHHKd++r111111r3   c                    t          j        dt                                        d          d         }|                    dd          i k    sJ d S )Nr   r   r   r"   r   r   r   s     r)   !and_not_when_there_is_no_emphasisz2DescribePhrasing.and_not_when_there_is_no_emphasis  sN    	;77==iHHK"--333333r3   enclosing_emphasis)r   r   r   c                    t          j        dt                                        d          d         }|                    |          |k    sJ dS )zInside emphasis is applied to text inside the phrasing element (but not its tail).

        The `._inside_emphasis()` method is overridden by Bold and Italic classes which add their
        specific emphasis characters.
        z<abbr/>z.//abbrr   N)r   rz   r   r{   _inside_emphasis)r<   r  abbrs      r)   =it_uses_the_enclosing_emphasis_as_the_default_inside_emphasiszNDescribePhrasing.it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis  sP     	;77==iHHK$$%788<NNNNNNNr3   z<abbr>aaa</abbr>z<bdi>x<bdo>bbb</bdo></bdi>z<bdi>x<bdo>bbb</bdo>ccc</bdi>zG<big>xxx<cite>aaa<code>bbb<data>ccc</data>ddd</code>eee</cite>fff</big>dddeeefffc                    t          j        |t                                        d          d         d         }t	          |                    d                    |k    sJ d S )Nr   r   r   r   rz   r   r{   r   _iter_child_text_segmentsr   s       r)   ;it_generates_text_segments_for_its_children_and_their_tailszLDescribePhrasing.it_generates_text_segments_for_its_children_and_their_tails  s[    6 Y44::9EEaHKA//3344FFFFFFr3   )r   inside_emphasisr-   z<dfn></dfn>z<kbd><p>aaa</p></kbd>z<kbd><p>aaa</p>bbb</kbd>z+<kbd><p>aaa</p>bbb<mark>ccc</mark>ddd</kbd>z5<strong><q>aaa</q><p>bbb</p>ccc<s>ddd</s>eee</strong>r  c                    t          j        |t                                        d          d         d         }t	          |                    |                    |k    sJ d S r   r  )r<   r   r  r-   r   s        r)   0and_it_generates_elements_for_its_block_childrenzADescribePhrasing.and_it_generates_elements_for_its_block_children  s\    Z Y44::9EEaHKA//@@AA^SSSSSSr3   r   emphasisr-   z<cite><p/></cite>z<cite><p/>aaa</cite>z<cite><p/><s>aaa</s></cite>z<bdi><p/><s>aaa</s>bbb</bdi>z,<sub><p/>aaa<s>bbb<q>ccc</q>ddd</s>eee</sub>z2<strong><p/>aaa<s>bbb<i>ccc</i>ddd</s>eee</strong>r   z4<cite><p/>aaa<abbr>bbb<p>ccc</p>ddd</abbr>eee</cite>r  c                B   t          j        |t                                        d          d         d         }|                    d          d         }|j        pd}t          |dd                    }t          |                    |||                    |k    sJ d S )Nr   r   z./pr   r   )r   rz   r   r{   tailr   r   0_iter_text_segments_from_block_tail_and_phrasing)r<   r   r  r-   r   r   r  qs           r)   @it_generates_text_segments_from_the_tail_and_contiguous_phrasingzQDescribePhrasing.it_generates_text_segments_from_the_tail_and_contiguous_phrasing7  s    | Y44::9EEaHKGGENN1v|!ABB%LL CCD!XVVWW     r3   N)r   r/   r-   r   )r   r/   r-   r   r,   r/   )r  r/   )r   r/   r  r/   r-   r   )r   r/   r  r/   r-   r   )rJ   rK   rL   rM   r   r%   r   r   r   r   r   r   r   r  r  r	  r  r  r  rN   r3   r)   r   r   i  s        % % % [' b!!WbM?3gr]O,=uVXk?Z[ G"IRLBKRL"I		
 0> > >1 0> [' +UU5\\N; 1KE[^__  E%LL	 1E%LLKE[^__  4KE[^__  E%LLKE[^__ 1$	
' 'P> > >Q' 'P>
 
 
 [Vb*%5662 2 2 7624 4 4 [1???CC	O 	O 	O DC	O ['  $)KKr,B,B+CD,{{5"/E/E{{SXZ\G]G].^_ ZKr**Kr**Kr**Kr**Kr**Kr**
	
 0G G G1 0G [: B#$b55<<.9'eeEllKKr<R<R-ST >E%LLKr**Kr**Kr**		 HKE[^__  E%LLKE[^__   KE[^__   KE[^__ '&	
) )TT T TU) )TT [3 !"b)#R++eR*@*@)AB*BUB1G1G0HI+R++eR2H2H++V[]_J`J`1ab ?Kr**Kr**Kr**Kr**Kr**
 EKE[^__   KE[^__   KE[_``   KE[^__   KE[^__ . GKr**Kr**E%LLKr**Kr**
Y7	
: :v
 
 
w: :v
 
 
r3   r   c                  @   e Zd ZdZej                            dddg fdd edi           gfdd edi           gfd	d ed
dgdgd          gfdd ed
dgdgd           edi           gfdd eddgdgdgdgd           edi           gfdd eddgdgdgdgd           edddd          gfg          d6d"            Zd# Z	d$ Z
d% Zd& Zd' Zd( Zd) Zd* Zd+ Zd, Zd- Zd. Zd/ Zd0 Zej                            d1dd2g          d7d3            Zd4 Zd5S )8DescribeAnchorzIsolated unit-test suite for `unstructured.partition.html.parser.Anchor`.

    The `Anchor` class is used for `<a>` tags and provides link metadata.
    r  z<a href="http://abc.com"></a>r   z(<a href="http://abc.com"></a> long tail z long tail z<a href="http://abc.com">  </a>z  z)<a href="http://abc.com"> click here </a>z click here z
click herehttp://abc.comr   z3<a href="http://abc.com"> click here </a> long tailz
 long tailzI<p>I am <a href="http://eie.io">one <u>with<i> the</i></u> Force</a>.</p>zone with the Forcether   r   r   r   r   r   r   zL<p>I am <strong><a href="http://eie.io">one with</a> the Force.</strong></p>r   one with the Force.
the Force.r    r   r/   r  r-   r   c                    t          j        |t                                        d          d         }t	          |                    |                    |k    sJ d S )N.//ar   r   )r<   r   r  r-   as        r)   Nit_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segmentz]DescribeAnchor.it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment  sW    \ Y44::6BB1EA((2233~EEEEEEr3   c           	     0   d}t          j        |t                                        d          d         }t	          |                    d                    t          ddgdgdgdgd          t          d	          t          d
ddd          gk    sJ d S )Nz;<a href="http://eie.io">I am <p>one with</p> the Force.</a>r%  r   r   zI am zI amr   r   r!  r"  r#  r    )r   rz   r   r{   r   r   r   r   r<   r   r&  s      r)   6it_generates_enclosed_block_items_as_separate_elementszEDescribeAnchor.it_generates_enclosed_block_items_as_separate_elements  s    U	Y44::6BB1EA((--..17-0E#)("1!2	   *0<,/  3
 
 
 
 
 
 
r3   c           	     ~   d}t          j        |t                                        d          d         }t	          |                    d                    }|t          di           t          d          t          dddd	          gk    sJ |d
         }|j        j	        dgk    sJ |j        j
        dgk    sJ d S )Nz><a href="http://eie.io"> 
 <p>I am one with</p> the Force.</a>r%  r   r   z 
 zI am one withr"  r#  r    r   r   )r   rz   r   r{   r   r   r   r
   rx   r   r   )r<   r   r&  actualr[   s        r)   Yand_it_annotates_first_enclosed_block_Element_when_no_non_whitespace_phrase_appears_firstzhDescribeAnchor.and_it_annotates_first_enclosed_block_Element_when_no_non_whitespace_phrase_appears_first  s     Z	Y44::6BB1Ea**3//00##/**0<,/  

 

 

 

 

 )*.?????)o->>>>>>>r3   c                   d}t          j        |t                                        d          d         }t	          |                    d                    t          di           ft          d          t          di           fgk    sJ d S )	Nzi
          <a href="http://eie.io">But always <p>see first.</p> Otherwise you </a> will only see
        r%  r   r   )r  But always 
see first.z Otherwise you )r   rz   r   r{   r   _iter_phrases_and_elementsr   r
   r)  s      r)   Eit_divides_the_anchor_contents_but_not_tail_into_phrases_and_elementszTDescribeAnchor.it_divides_the_anchor_contents_but_not_tail_into_phrases_and_elements
  s    	 Y44::6BB1EA00"0==>>++-,''*B//1C
 
 
 
 
 
 
r3   c           	     8   d}t          j        |t                                        d          d         }t	          j        t                    5  t          |                    dt          g           d                     d d d            d S # 1 swxY w Y   d S )N<a href="http://eie.io"></a>r%  r   r   r,   r  r  )
r   rz   r   r{   r%   r&   r9   r:   _iter_phrasingr   r)  s      r)   6it_generates_zero_items_when_both_text_and_q_are_emptyzEDescribeAnchor.it_generates_zero_items_when_both_text_and_q_are_empty  s    6	Y44::6BB1E]=)) 	F 	F!!rU2YY!DDEEE	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	Fs   3BBBc                
   d}t          j        |t                                        d          d         }t	          |                    |j        t          |          d                    t          di           fgk    sJ d S )Nz5<a href="http://eie.io">
  But always see first.
</a>r%  r   r   r5  z
  But always see first.
	r   rz   r   r{   r   r6  r,   r   r   r)  s      r)   /it_generates_a_phrase_when_only_text_is_presentz>DescribeAnchor.it_generates_a_phrase_when_only_text_is_present  s    Q	Y44::6BB1EA$$!&E!HHr$JJKK6;;=P
 
 
 
 
 
 
r3   c           	     p   d}t          j        |t                                        d          d         }t	          |                    |j        t          |          d                    t          di           t          ddd	d
          t          dddd
          t          di           fgk    sJ d S )NzI<a href="http://eie.io">But always <b>see <i>first</i></b>. Otherwise</a>r%  r   r   r5  r/  zsee seer   r    firstr   z. Otherwiser9  r)  s      r)   Jand_it_generates_a_phrase_when_that_text_is_followed_by_a_phrasing_elementzYDescribeAnchor.and_it_generates_a_phrase_when_that_text_is_followed_by_a_phrasing_element'  s    c	Y44::6BB1EA$$!&E!HHr$JJKKM2..4903   4;04   M2..!P
 
 
 
 
 
 
r3   c                
   d}t          j        |t                                        d          d         }t	          |                    |j        t          |          d                    t          di           fgk    sJ d S )NzE<a href="http://eie.io">But always see first.</a> Otherwise you will r%  r   r   r5  zBut always see first.r9  r)  s      r)   ,it_ends_the_phrase_at_the_end_of_the_elementz;DescribeAnchor.it_ends_the_phrase_at_the_end_of_the_element@  s    _	Y44::6BB1EA$$!&E!HHr$JJKK0"557P
 
 
 
 
 
 
r3   c                
   d}t          j        |t                                        d          d         }t	          |                    |j        t          |          d                    t          di           fgk    sJ d S )NzH<a href="http://eie.io">But always see first. <p>Otherwise you </p> </a>r%  r   r   r5  zBut always see first. r9  r)  s      r)   2but_it_ends_at_a_block_element_if_one_occurs_firstzADescribeAnchor.but_it_ends_at_a_block_element_if_one_occurs_firstH  s    b	Y44::6BB1EA$$!&E!HHr$JJKK12668P
 
 
 
 
 
 
r3   c           	        d}t          j        |t                                        d          d         }t	          |                    |j        t          |          d                    t          di           t          ddd	d
          ft          d          t          ddd	d
          t          di           fgk    sJ d S )Nzk
          <a href="http://eie.io">But <strong>always <p>see first.</p>Otherwise</strong> you </a>
        r%  r   r   r5  zBut zalways alwaysr   r    r0  	Otherwisez you )
r   rz   r   r{   r   r6  r,   r   r   r
   r)  s      r)   ?it_generates_an_element_for_a_block_item_nested_inside_phrasingzNDescribeAnchor.it_generates_an_element_for_a_block_item_nested_inside_phrasingP  s    	 Y44::6BB1EA$$!&E!HHr$JJKKFB''4<03  	 ,''4?03   GR((	P
 
 
 
 
 
 
r3   c                   d}t          j        |t                                        d          d         }t	          d          }|                    |          }||u sJ |j        j        dgk    sJ |j        j        dgk    sJ d S )Nr4  r%  r   r   r   	r   rz   r   r{   r   _link_annotate_elementrx   r   r   r<   r   r&  r[   r   s        r)   +it_adds_link_metadata_to_an_element_to_helpz:DescribeAnchor.it_adds_link_metadata_to_an_element_to_helpp  s    6	Y44::6BB1Eu++$$W--G||||z$////z#'8888888r3   c                H   d}t          j        |t                                        d          d         }t	          d          }dg|j        _        dg|j        _        |                    |          }||u sJ |j        j        ddgk    sJ |j        j        ddgk    sJ d S )Nr4  r%  r   r   abcr  r   )	r   rz   r   r{   r   rx   r   r   rI  rJ  s        r)   :and_it_preserves_any_existing_link_metadata_on_the_elementzIDescribeAnchor.and_it_preserves_any_existing_link_metadata_on_the_element{  s    6	Y44::6BB1Eu++',g#&6%7"$$W--G||||z$6666z#(8/'JJJJJJJr3   c                    d}t          j        |t                                        d          d         }t	          d          }|                    |          }||u sJ |j        j        J |j        j        J d S )Nz<a href="http://eie.io"/>r%  r   r   rH  rJ  s        r)   but_not_when_the_text_is_emptyz-DescribeAnchor.but_not_when_the_text_is_empty  s    3	Y44::6BB1Er(($$W--G||||z$,,,z#+++++r3   c                    d}t          j        |t                                        d          d         }t	          d          }|                    |          }||u sJ |j        j        J |j        j        J d S )Nz<a/>r%  r   zzzrH  rJ  s        r)   and_not_when_there_is_no_urlz+DescribeAnchor.and_not_when_there_is_no_url  s    	Y44::6BB1Eu++$$W--G||||z$,,,z#+++++r3   c           	     ,   d}t          j        |t                                        d          d         }t	          ddgdgd          t	          dd	d
d          f}|                    |          }|t	          ddd	gdd
gdgdgd          k    sJ d S )Nr4  r%  r   zOtherwise you will only rE  r   r    zsee what you were expecting.
	expectingr   z5Otherwise you will only see what you were expecting.
z4Otherwise you will only see what you were expecting.r   r   r   rz   r   r{   r   _link_text_segment)r<   r   r&  rF   link_text_segments        r)   Iit_consolidates_a_phrase_into_a_single_link_annotated_TextSegment_to_helpzXDescribeAnchor.it_consolidates_a_phrase_into_a_single_link_annotated_TextSegment_to_help  s    6	Y44::6BB1E*1<-0E   00;,/  
" 0088 KD-8+,F),c
UV-.	 %
 %
 
 
 
 
 
 
r3   r,   z 
 	 c                    d}t          j        |t                                        d          d         }t	          |i           t	          |i           t	          |i           f}|                    |          J d S )Nr4  r%  r   rV  )r<   r,   r   r&  rF   s        r)   1but_not_when_the_text_is_empty_or_whitespace_onlyz@DescribeAnchor.but_not_when_the_text_is_empty_or_whitespace_only  sw    6	Y44::6BB1EdB''T2)>)>DRT@U@UV##F++33333r3   c                    d}t          j        |t                                        d          d         }t	          di           t	          di           f}|                    |          J d S )Nz<a>foobar</a>r%  r   rE  z	 you willrV  )r<   r   r&  rF   s       r)   'and_not_when_the_anchor_has_no_href_urlz6DescribeAnchor.and_not_when_the_anchor_has_no_href_url  sj    '	Y44::6BB1Ek2..K0L0LM##F++33333r3   N)r   r/   r  r/   r-   r   r  )rJ   rK   rL   rM   r%   r   r   r   r'  r*  r-  r2  r7  r:  r>  r@  rB  rF  rK  rN  rP  rS  rY  r[  r]  rN   r3   r)   r  r    s         [3 -b"57kk-Y[>\>\=]^.[[r5J5J4KL <K&(4~EUDVWW 	 FK&(4~EUDVWW   Kb11
 \K,9>58E+?*@*9):	    KR(($ _K"9C58E+5,*9):	    K%8D47  cG	
J JVF F FWJ JVF
 
 
0? ? ?2

 

 

F F F
 
 

 
 
2
 
 

 
 

 
 
@	9 	9 	9K K K	, 	, 	,	, 	, 	,
 
 
@ [Vb)_554 4 4 6544 4 4 4 4r3   r  c                  $    e Zd ZdZd Zd Zd ZdS )DescribeBoldzIsolated unit-test suite for `unstructured.partition.html.parser.Bold`.

    The `Bold` class is used for `<b>` and `<strong>` tags and adds emphasis metadata.
    c                    t          j        dt                                        d          d         }|                                }t          |          \  }}|dk    sJ |dddk    sJ d S )Nz<b>rhombus</b>r   r   rhombusr   r    r   rz   r   r{   r   r:   r<   r   r   r,   
annotations        r)   0it_annotates_its_text_segment_with_bold_emphasisz=DescribeBold.it_annotates_its_text_segment_with_bold_emphasis      -{;;AA&II!L,,....jy    (1$'
 
 
 
 
 
 
 
r3   c                (   t          j        dt                                        d          d         }|                                }t          |          \  }}|dk    sJ |dddk    sJ t          |          \  }}|dk    sJ |dd	dk    sJ d S )
Nz<b>rhombus <i>pentagon</i></b>r   r   rhombus ra  r   r    pentagonr   rb  rc  s        r)   6and_its_children_are_also_annotated_with_bold_emphasiszCDescribeBold.and_its_children_are_also_annotated_with_bold_emphasis  s    ={KKQQRXYYZ[\,,....jz!!!!(1$'
 
 
 
 
 
  ..jz!!!!(2$(
 
 
 
 
 
 
 
r3   c                "   t          j        dt                                        d          d         }|                                }t          |          \  }}|dk    sJ |dddk    sJ t          |          \  }}|dk    sJ |i k    sJ d S )Nz<b>rhombus</b> pentagonr   r   ra  r   r    	 pentagonrb  rc  s        r)   but_not_its_tailzDescribeBold.but_not_its_tail      6DDJJ6RRSTU,,....jy    (1$'
 
 
 
 
 
  ..j{""""Rr3   N)rJ   rK   rL   rM   re  rj  rm  rN   r3   r)   r_  r_    K         


 

 


 
 
$         r3   r_  c                  $    e Zd ZdZd Zd Zd ZdS )DescribeItaliczIsolated unit-test suite for `unstructured.partition.html.parser.Italic`.

    The `Italic` class is used for `<i>` and `<em>` tags and adds emphasis metadata.
    c                    t          j        dt                                        d          d         }|                                }t          |          \  }}|dk    sJ |dddk    sJ d S )Nz<i>rhombus</i>.//ir   ra  r   r    rb  r<   r   r   r,   rd  s        r)   2it_annotates_its_text_segment_with_italic_emphasiszADescribeItalic.it_annotates_its_text_segment_with_italic_emphasis  rf  r3   c                (   t          j        dt                                        d          d         }|                                }t          |          \  }}|dk    sJ |dddk    sJ t          |          \  }}|dk    sJ |dd	dk    sJ d S )
Nz <em>rhombus <b>pentagon</b></em>z.//emr   rh  ra  r   r    ri  r   rb  )r<   emr   r,   rd  s        r)   8and_its_children_are_also_annotated_with_italic_emphasiszGDescribeItalic.and_its_children_are_also_annotated_with_italic_emphasis  s    @+NNTTU\]]^_`--//..jz!!!!(1$'
 
 
 
 
 
  ..jz!!!!(2$(
 
 
 
 
 
 
 
r3   c                "   t          j        dt                                        d          d         }|                                }t          |          \  }}|dk    sJ |dddk    sJ t          |          \  }}|dk    sJ |i k    sJ d S )Nz<i>rhombus</i> pentagonrs  r   ra  r   r    rl  rb  rt  s        r)   rm  zDescribeItalic.but_not_its_tail)  rn  r3   N)rJ   rK   rL   rM   ru  rx  rm  rN   r3   r)   rq  rq    ro  r3   rq  c                      e Zd ZdZd ZdS )DescribeLineBreaka:  Isolated unit-test suite for `unstructured.partition.html.parser.LineBreak`.

    Used for `<br/>` elements, it's only special behavior is to add whitespace such that phrasing
    butted up tight on both sides of the `<br/>` element is not joined, like `abc<br/>def` should
    become "abc def", not "abcdef".
    c                   t          j        dt                                        d          d         }|                                }d |D             }|g dk    sJ t          d                    |                    dk    sJ d S )Nz:<cite>spaceships of the<br/>Vogon Constructor Fleet</cite>r   r   c                    g | ]	}|j         
S rN   r,   .0tss     r)   
<listcomp>zDDescribeLineBreak.it_adds_a_newline_in_its_place.<locals>.<listcomp>H  s    111R111r3   )zspaceships of the
zVogon Constructor Fleetr   z)spaceships of the Vogon Constructor Fleet)r   rz   r   r{   r   r   join)r<   r   r   textss       r)   it_adds_a_newline_in_its_placez0DescribeLineBreak.it_adds_a_newline_in_its_placeA  s    H+
 

%	

1 //1111=111NNNNNNNrwwu~~..2]]]]]]]r3   N)rJ   rK   rL   rM   r  rN   r3   r)   r{  r{  9  s2         	^ 	^ 	^ 	^ 	^r3   r{  c                      e Zd ZdZd ZdS )DescribeRemovedPhrasingzIsolated unit-test suite for `unstructured.partition.html.parser.RemovedPhrasing`.

    Used for phrasing elements like `<label>` that we want to skip, including any content they
    enclose. The tail of such an element is not skipped though.
    c                   t          j        dt                                        d          d         }t	          |                                          \  }t          |t                    sJ |j        du sJ |j	        dk    sJ d S )Nzh<div>
  <label>Space<p>is big</p>, <b>mind-bogglingly</b> big.</label>
  Like vastly, hugely big.
</div>z.//labelr   Tz
  Like vastly, hugely big.
)
r   rz   r   r{   r   r   r   r   r   r,   )r<   labeltext_segments      r)    it_behaves_like_an_empty_elementz8DescribeRemovedPhrasing.it_behaves_like_an_empty_elementT  s      
 
 %


A u7799::%11111 D(((( $DDDDDDDr3   N)rJ   rK   rL   rM   r  rN   r3   r)   r  r  M  s2         E E E E Er3   r  c                  *    e Zd ZdZd Zd Zd Zd ZdS )DescribeDefaultElementa  Isolated unit-test suite for `unstructured.partition.html.parser.DefaultElement`.

    Used for any element we haven't assigned a custom element-class too. This prominently includes
    any non-HTML elements that can be embedded in the HTML.

    It identifies as a block item but it can behave as either a block-item or phrasing. Its behavior
    is a combination of RemovedBlock and RemovedPhrasing. Namely, it iterates zero elements and only
    iterates a text-segment for its tail.
    c                    t          j        dt                                        d          d         }t	          |t
                    sJ |j        du sJ d S )Nz<foobar>Vogon</foobar>	.//foobarr   T)r   rz   r   r{   r   r   r   )r<   r"   s     r)   #it_identifies_as_a_phrasing_elementz:DescribeDefaultElement.it_identifies_as_a_phrasing_elementt  sY    !":KHHNN{[[\]^&.11111!T))))))r3   c                   t          j        dt                                        d          d         }|                                }t          j        t                    5  t          |           ddd           dS # 1 swxY w Y   dS )z0Should never be called but belts and suspenders.z@<foobar>Space<p>is big</p>, <b>mind-bogglingly</b> big.</foobar>r  r   N)	r   rz   r   r{   r   r%   r&   r9   r:   )r<   r"   r   s      r)   *it_generates_zero_elements_as_a_block_itemzADescribeDefaultElement.it_generates_zero_elements_as_a_block_item|  s    !N
 
 %

Q 
 ''))]=)) 	 	NNN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   !A>>BBc                    t          j        dt                                        d          d         }d |                                D             }|dgk    sJ d S )N<div>
  O Deep Thought computer, he said,
  <foobar>Vogon Constructor Fleet</foobar>
  The task we have designed you to perform is this.
  <p>We want you to tell us.... he paused,</p>
</div>r  r   c                    g | ]	}|j         
S rN   r~  r  s     r)   r  zuDescribeDefaultElement.it_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasing.<locals>.<listcomp>  s    ???R???r3   z7
  The task we have designed you to perform is this.
  )r   rz   r   r{   r   )r<   r"   r  s      r)   Jit_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasingzaDescribeDefaultElement.it_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasing  sq    ! 
 
 %

Q  @?6#<#<#>#>???TUUUUUUUr3   c                    t          j        dt                                        d          d         }d |                                D             }|ddgk    sJ d S )Nr  r   r   c                    g | ]	}|j         
S rN   r~  )r  r   s     r)   r  zoDescribeDefaultElement.and_it_behaves_like_an_empty_phrasing_element_inside_a_block_element.<locals>.<listcomp>  s    555A555r3   zSO Deep Thought computer, he said, The task we have designed you to perform is this.z%We want you to tell us.... he paused,)r   rz   r   r{   r   )r<   r   r  s      r)   Dand_it_behaves_like_an_empty_phrasing_element_inside_a_block_elementz[DescribeDefaultElement.and_it_behaves_like_an_empty_phrasing_element_inside_a_block_element  s     
 
 %//! 65!2!2!4!4555a3
 
 
 
 
 
 
r3   N)rJ   rK   rL   rM   r  r  r  r  rN   r3   r)   r  r  g  s]         * * *
 
 
V V V
 
 
 
 
r3   r  )r,   r/   r-   r/   )-rM   
__future__r   collectionsr   r%   lxmlr   unstructured.documents.elementsr   r   r	   r
   r   r   "unstructured.partition.html.parserr   r   r   r   r   r   r   r   r   r   r   r   r*   r   r   r2   r5   rP   r   r   r   r   r   r  r_  rq  r{  r  r  rN   r3   r)   <module>r     s   B A " " " " " "              b b b b b b b b b b b b b b b b                           (E E EF    3 3 3 3% % % % % % % %PF F F F F F F FD
 
 
 
 
 
 
 
.l/ l/ l/ l/ l/ l/ l/ l/^B9 B9 B9 B9 B9 B9 B9 B9JP P P P P P P P2T
 T
 T
 T
 T
 T
 T
 T
nN4 N4 N4 N4 N4 N4 N4 N4b
1  1  1  1  1  1  1  1 h1  1  1  1  1  1  1  1 h^ ^ ^ ^ ^ ^ ^ ^(E E E E E E E E4B
 B
 B
 B
 B
 B
 B
 B
 B
 B
r3   