
    Ng                       d Z ddlmZ ddlZddlZddlmZ ddlZddlm	Z	 ddl
mZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZmZmZ dd
lmZ ddl m!Z! ddl"m#Z#m$Z$ dndZ%dndZ&d Z'd Z(dodZ)d Z*ej+        ,                    dddg          dpd            Z-ej+        ,                    dg d          dpd            Z.ej+        ,                    dddg          dpd            Z/ej+        ,                    dg d          dpd            Z0ej+        ,                    dddg          dpd             Z1ej+        ,                    dg d          dpd!            Z2d" Z3d# Z4dod$Z5dod%Z6dod&Z7d' Z8d( Z9d) Z:d* Z;d+ Z<d, Z=d- Z>ej+        ,                    d.d/d0g          dqd3            Z?d4 Z@d5 ZAd6 ZBd7 ZCd8 ZDd9 ZEd: ZFd; ZGd< ZHd= ZId> ZJd? ZKd@ ZLdA ZMdB ZNdC ZOdD ZPdE ZQej+        ,                    d1g dF          drdG            ZRdH ZSdI ZTdJ ZUdK ZVdL ZWdM ZXdN ZYdO ZZdP Z[dQ Z\dR Z]dsdUZ^dV Z_dW Z`dX Zaej+        ,                    dYdZd[g          dtd^            Zbdod_Zcd` Zd edab          gZe G dc dd          Zfejg        dudg            Zhejg        dvdi            Zi G dj dk          Zj G dl dm          ZkdS )wz>Test suite for `unstructured.partition.html.partition` module.    )annotationsN)Any)etree)FixtureRequestMockassert_round_trips_through_JSONexample_doc_pathexample_doc_textfunction_mock)chunk_by_title)clean_extra_whitespace)AddressCompositeElementListItemNarrativeTextTable
TableChunkTextTitle)read_txt_filepartition_html)HtmlPartitionerOptions_HtmlPartitionertmp_pathpathlib.Pathc           
        t          | dz            }t          |d          5 }|                    d           d d d            n# 1 swxY w Y   t          |          }t	          |          dk    sJ |t          d          t          d          t          d          t          d          t          d	          t          d
          t          d          gk    sJ t          d |D                       sJ d S )Nsample-doc.htmlwa  <html>
  <body>
    <h1>A Great and Glorious Section</h1>
    <p>Dear Leader is the best. He is such a wonderful engineer!</p>
    <p></p>
    <p>Another Magnificent paragraph</p>
    <p><b>The prior element is a title based on its capitalization patterns!</b></p>
    <table>
      <tbody>
        <tr>
          <td><p>I'm in a table</p></td>
        </tr>
      </tbody>
    </table>
    <h2>A New Beginning</h2>
    <div>Here is the start of a new page.</div>
  </body>
</html>
   zA Great and Glorious Sectionz9Dear Leader is the best. He is such a wonderful engineer!zAnother Magnificent paragraphzBThe prior element is a title based on its capitalization patterns!zI'm in a tablezA New Beginningz Here is the start of a new page.c              3  6   K   | ]}|j         j        d k    V  dS )r   Nmetadatafilename.0es     k/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/partition/html/test_partition.py	<genexpr>z:test_partition_html_accepts_a_file_path.<locals>.<genexpr>Q   s-      JJAqz"&77JJJJJJ    )	stropenwriter   lenr   r   r   all)r   	file_pathfelementss       r(   'test_partition_html_accepts_a_file_pathr3   -   sS   H0011I	i		 
		
 	
 	

 
 
 
 
 
 
 
 
 
 
 
 
 
 
, i((Hx==A,--QRR-..Z[[  899     JJJJJJJJJJJJs   AA	A	c                   | dz  }|                     t          d                     |                    d           t          t	          |                                                    }t          |          dk    sJ d S )Nzexample-10k-readonly.htmlexample-10k-1p.htmli$  )r$   r   )
write_textr
   chmodr   r+   resolver.   )r   read_only_file_pathr2   s      r(   :test_user_without_file_write_permission_can_partition_htmlr:   T   s    "%@@""#34I#J#JKKKe$$$s+>+F+F+H+H'I'IJJJHx==1r*   c                     t          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   t          |          dk    sJ t	          d |D                       sJ d S )Nr5   rbfiler   c              3  2   K   | ]}|j         j        d u V  d S Nr"   r%   s     r(   r)   zAtest_partition_html_accepts_a_file_like_object.<locals>.<genexpr>c   s,      ==qqz"d*======r*   r,   r	   r   r.   r/   r1   r2   s     r(   .test_partition_html_accepts_a_file_like_objectrC   ^   s    	455t	<	< *!q)))* * * * * * * * * * * * * * * x==1==H==========   ;??c                 j    t          t          d                    } t          |           dk    sJ d S )Nr5   textr   )r   r
   r.   r2   s    r(   'test_partition_html_accepts_an_html_strrI   f   s9    #34I#J#JKKKHx==1r*   requests_get_r   c                    t          t          d          dddi          | _        t          d          }|                     di d	           t          |          d
k    sJ d S )Nr5      Content-Type	text/htmlrG   status_codeheadershttps://fake.urlurlTrQ   verifyr   )FakeResponser
   return_valuer   assert_called_once_withr.   rJ   r2   s     r(   5test_partition_html_accepts_a_url_to_an_HTML_documentr[   k   s}    !-344-" " "M "4555H))*<bQU)VVVx==1r*   c                     t          j        t          d          5  t                       d d d            d S # 1 swxY w Y   d S )Nz6Exactly one of filename, file, text, or url must be spmatch)pytestraises
ValueErrorr    r*   r(   Ktest_partition_html_raises_when_no_path_or_file_or_text_or_url_is_specifiedrc   x   s    	z)a	b	b	b                   s   8<<r$   example-10k-utf-16.html&example-steelJIS-datasheet-utf-16.htmlr+   c                    t          j        t                    5  t          t	          |           d          5 }t          |d           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )Nr<   utf-8r>   encodingr_   r`   UnicodeDecodeErrorr,   r	   r   r$   r1   s     r(   Htest_partition_html_from_filename_raises_when_explicit_encoding_is_wrongrm      s     
)	*	* 5 5"8,,d33 	5qG4444	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	55 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5s4   A.A
A.A	A.A	A..A25A2)rd   re   fake-html-lang-de.htmlc                     t          t                               }t          |          dk    sJ t           fd|D                       sJ  dk    r|t          k    sJ d S d S )Nr   c              3  8   K   | ]}|j         j        k    V  d S r@   r"   )r&   r'   r$   s     r(   r)   zEtest_partition_html_from_filename_default_encoding.<locals>.<genexpr>   s-      AA1qz"h.AAAAAAr*   rn   )r   r	   r.   r/   EXPECTED_OUTPUT_LANGUAGE_DE)r$   r2   s   ` r(   2test_partition_html_from_filename_default_encodingrr      s    
 .x8899Hx==1AAAAAAAAAAAA+++66666 ,+66r*   c                F   t          t          |           d          5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          j        t          d          5  t          |d           d d d            d S # 1 swxY w Y   d S )Nr<   ,'utf-8' codec can't decode byte 0xff in posir]   rg   rh   )	r,   r	   ioBytesIOreadr_   r`   rk   r   )r$   r1   r>   s      r(   3test_partition_html_from_file_raises_encoding_errorrx      s    
x(($	/	/ $1z!&&((##$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ 
)1_	`	`	` 4 4D733334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4s#   'AAA7BBBc                    t          t          |           d          5 }t          |          }d d d            n# 1 swxY w Y   t          |          dk    sJ | dk    r|t          k    sJ d S d S Nr<   r=   r   rn   r,   r	   r   r.   rq   r$   r1   r2   s      r(   .test_partition_html_from_file_default_encodingr}          
 
x(($	/	/ *1!q)))* * * * * * * * * * * * * * * x==1+++66666 ,+66rD   c                    t          j        t          d          5  t          t	          |           d          5 }t          |d           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )Nrt   r]   r<   rg   rh   rj   rl   s     r(   6test_partition_html_from_file_rb_raises_encoding_errorr      s     
)1_	`	`	` 5 5"8,,d33 	5qG4444	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	55 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5s4   A0AA0A	A0A	 A00A47A4c                    t          t          |           d          5 }t          |          }d d d            n# 1 swxY w Y   t          |          dk    sJ | dk    r|t          k    sJ d S d S rz   r{   r|   s      r(   1test_partition_html_from_file_rb_default_encodingr      r~   rD   c                 P    d} t          |           }|d         j        dk    sJ d S )Nu+   <html><div><p>每日新闻</p></div></html>rF   r   u   每日新闻)r   rG   	html_textr2   s     r(   /test_partition_html_processes_chinese_chractersr      s5    =I9---HA;~------r*   c                 N    t          d          t          d          gk    sJ d S )Nz3<html charset="utf-8"><p>Hello &#128512;</p></html>rF   u
   Hello 😀r   r   rb   r*   r(   'test_emoji_appears_with_emoji_utf8_coder      s@    TUUUlZ      r*   c                    t          t          d          dddi          | _        t          j        t
          d          5  t          d	           d d d            d S # 1 swxY w Y   d S )
Nr5   i  rM   rN   rO   z-Error status code on GET of provided URL: 500r]   rR   rS   rW   r
   rX   r_   r`   ra   r   rJ   s    r(   Ctest_partition_html_from_url_raises_on_failure_response_status_coder      s    !-344-" " "M 
z)X	Y	Y	Y / /-..../ / / / / / / / / / / / / / / / / /   A  A$'A$c                    t          t          d          dddi          | _        t          j        t
          d          5  t          d	           d d d            d S # 1 swxY w Y   d S )
Nr5   rL   rM   zapplication/jsonrO   z6Expected content type text/html. Got application/json.r]   rR   rS   r   r   s    r(   Etest_partition_html_from_url_raises_on_response_of_wrong_content_typer      s    !-344!34" " "M 
z)a	b	b	b / /-..../ / / / / / / / / / / / / / / / / /r   c                    t          ddddi          | _        t          dddi	           |                     dddid
           d S )NzS<html><head></head><body><p>What do I know? Who needs to know it?</p></body></html>rL   rM   rN   rO   zhttps://example.comz
User-Agenttest)rT   rQ   TrU   )rW   rX   r   rY   r   s    r(   <test_partition_from_url_includes_provided_headers_in_requestr      su    !-b-" " "M ,|V6LMMMM))f'=d *     r*   c                     t          t          d                    } t          |           dk    sJ | d         }|t          d          k    sJ |j        j        J |j        j        J |j        j        J d S )Nzideas-page.html   r   a  January 2023 ( Someone fed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from.  The
answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge. Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.)r   r	   r.   r   r#   emphasized_text_contents	link_urlstext_as_htmlr2   r'   s     r(   !test_partition_html_on_ideas_pager      s    ./@AABBHx==AA	$      :.666:''':".....r*   c                 t    t          t          d                    } t          d | D                       sJ d S )Nzexample-with-scripts.htmlc              3  (   K   | ]}d |j         vV  dS )z
function (NrF   r%   s     r(   r)   z?test_it_does_not_extract_text_in_script_tags.<locals>.<genexpr>  s)      <<a|16)<<<<<<r*   )r   r	   r/   rH   s    r(   ,test_it_does_not_extract_text_in_script_tagsr     sB    ./JKKLLH<<8<<<<<<<<<<r*   c                 v    d} t          |           \  }t          |t                    sJ |j        dk    sJ d S )Nzc<html>
<body>
  <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>
</body>
</html>rF   zLorem ipsum dolor)r   
isinstancer   rG   r   elements     r(   +test_it_does_not_extract_text_in_style_tagsr     sP    	   Y///JWgt$$$$$<.......r*   c                     d} t          |           \  }t          |t                    sJ |j        dk    sJ |j        j        dk    sJ dS )z@Bare-bones means no `<thead>`, `<tbody>`, or `<tfoot>` elements.z<html>
<body>
  <table>
    <tr><td>Lorem</td><td>Ipsum</td></tr>
    <tr><td>Ut enim non</td><td>ad minim
veniam quis</td></tr>
  </table>
</body>
</html>rF   z,Lorem Ipsum Ut enim non ad minim
veniam quiszr<table><tr><td>Lorem</td><td>Ipsum</td></tr><tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr></table>N)r   r   r   rG   r#   r   r   s     r(   7test_it_can_parse_a_bare_bones_table_to_a_Table_elementr   .  sv    	   Y///JWgu%%%%%<JJJJJ(	     r*   c                     d} t          |           \  }t          |t                    sJ |j        j        dk    sJ dS )a	  Cells within a `table/thead` element are included in the text and html.

    The presence of a `<thead>` element in the original also determines whether a `<thead>` element
    appears in `.text_as_html` or whether the first row of cells is simply in the body.
    W  <html>
<body>
  <table>
    <thead>
      <tr><th>Lorem</th><th>Ipsum</th></tr>
    </thead>
    <tbody>
      <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>
      <tr><th>Ut enim non</th><td>ad minim
veniam quis</td></tr>
    </tbody>
    <tfoot>
      <tr><th>Dolor</th><td>Equis</td></tr>
    </tfoot>
  </table>
</body>
</html>rF   z<table><tr><td>Lorem</td><td>Ipsum</td></tr><tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr><tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr><tr><td>Dolor</td><td>Equis</td></tr></table>N)r   r   r   r#   r   r   s     r(   Ttest_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elementsr   I  sb    	 &  Y///JWgu%%%%%(	     r*   c                 6    d} t          |           g k    sJ d S )Nzv<html>
<body>
  <table>
    <tr><td> </td><td> </td></tr>
    <tr><td> </td><td> </td></tr>
  </table>
</body>
</html>rF   r   )r   s    r(   >test_it_does_not_emit_a_Table_element_for_a_table_with_no_textr   o  s0    	  y)))R//////r*   c                     d} t          |           \  }|j        j        }|J t          j        |t          j                              }|J t          j        |t                    dk    sJ d S )Nr   rF   )ri   z<html><body><table><tr><td>Lorem</td><td>Ipsum</td></tr><tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr><tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr><tr><td>Dolor</td><td>Equis</td></tr></table></body></html>)r   r#   r   r   
fromstring
HTMLParsertostringr+   )r   r   r   htmls       r(   /test_it_provides_parseable_HTML_in_text_as_htmlr   ~  s    	 &  Y///JW#0L###L%*:*<*<==D>$---		 	 	 	 	 	r*   )tagexpected_text_as_html)thead:<table><tr><td>Header 1</td><td>Header 2</td></tr></table>)tfootr   r   r   c                d    t          d|  d|  d          }|d         j        j        |k    sJ d S )Nz<table>
  <z6>
    <tr><th>Header 1</th><th>Header 2</th></tr>
  </z
>
</table>rF   r   )r   r#   r   )r   r   r2   s      r(   .test_partition_html_parses_table_without_tbodyr     sb           H A;,0EEEEEEEr*   c                 |    d} t          |           \  }|t          d          k    sJ |j        j        dk    sJ d S )Nz<table>
 <tr>
  <td>
   <table>
     <tr><td>foo</td><td>bar</td></tr>
     <tr><td>baz</td><td>bng</td></tr>
   </table>
  </td>
  <td>
   <table>
     <tr><td>fizz</td><td>bang</td></tr>
   </table>
  </td>
 </tr>
</table>rF   zfoo bar baz bng fizz bangzB<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>r   r   r#   r   r   s     r(   Ztest_partition_html_reduces_a_nested_table_to_its_text_placed_in_the_cell_that_contains_itr     sd    	 $  Y///JWe7888888(L     r*   c                 |    d} t          |           \  }|t          d          k    sJ |j        j        dk    sJ dS )z)Like this example from an SEC 10k filing.a  <table>
 <tr>
  <td></td>
  <td></td>
 </tr>
 <tr>
  <td>
   <p>
    <span>
     <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a" contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport" format="ixt-sec:boolballotbox">
     <span>&#9746;</span>
     </ix:nonNumeric>
    </span>
   </p>
  </td>
  <td>
   <p>
    <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</span>
   </p>
  </td>
 </tr>
</table>
rF   uX   ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934u   <table><tr><td/><td/></tr><tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</td></tr></table>Nr   r   s     r(   8test_partition_html_accommodates_tds_with_child_elementsr     st    	 8  Y///JWeb      (	     r*   c                 <   d} t          |           }t          |          dk    sJ |d         }|t          d          k    sJ |j        j        J |d         }|t          d          k    sJ |j        j        dk    sJ |d         }|t          d	          k    sJ |j        j        dk    sJ |d
         }|t          d          k    sJ |j        j        dk    sJ |d         }|t          d          k    sJ |j        j        d
k    sJ |d         }|t          d          k    sJ |j        j        dk    sJ |d         }|t          d          k    sJ |j        j        dk    sJ |d         }|t          d          k    sJ |j        j        dk    sJ d S )Naa  <p>This is narrative text, it's long, flows and has meaning</p>
<h1>This heading is a title, even though it's long, flows and has meaning</h1>
<h2>A heading that is at the second level</h2>
<h3>Finally, the third heading</h3>
<h4>December 1-17, 2017</h4>
<h5>email@example.com</h5>
<h6>* bullet point</h6>
<h3><li>- invalidly nested list item</li></h3>
rF      r   z8This is narrative text, it's long, flows and has meaningr   zEThis heading is a title, even though it's long, flows and has meaning   z%A heading that is at the second level   zFinally, the third heading   zDecember 1-17, 2017   zemail@example.com   z* bullet pointr    z- invalidly nested list item)r   r.   r   r#   category_depthr   r   r   r2   r'   s      r(   Dtest_partition_html_recognizes_h1_to_h6_as_Title_with_category_depthr     s   	;  9---Hx==AAXYYYYYY:$,,,A]^^^^^^:$))))A=>>>>>>:$))))A2333333:$))))A+,,,,,,:$))))A)******:$))))A&'''''':$))))A7888888:$))))))r*   c                    t          t          d                    } t          dt          |                      t          |           dk    sJ t	          | d         j                                      d          sJ t          | d         t                    sJ | d         j	        j
        dk    sJ | d         j	        j        dk    sJ d S )Nzfake-html-pre.htmzlen(elements)=r   z[107th Congress Public Law 56]rN   )r   r	   printr.   r   rG   
startswithr   r   r#   filetyper$   rH   s    r(   4test_partition_html_with_widely_encompassing_pre_tagr   .  s    ./BCCDDH	
S]]

x==1!(1+"233>>?_`````hqk=11111A;(K7777A;(,???????r*   c                     t          d          t          d          t          d          t          d          t          d          gk    sJ d S )Nz<pre>The Big Brown Bear</pre>
<div>The big brown bear is growling.</div>
<pre>The big brown bear is sleeping.</pre>
<div>The Big Blue Bear</div>
rF   zThe Big Brown BearzThe big brown bear is growling.zThe big brown bear is sleeping.zThe Big Blue Bearr   r   r   rb   r*   r(   #test_pre_tag_parsing_respects_orderr   9  sn    -   	"##788788!""	
     r*   c                 
   d} t          |           }|t          d          t          d          t          d          t          d          gk    sJ |d         }|j        j        g dk    sJ |j        j        g d	k    sJ d S )
Na*  <!DOCTYPE html>
<html>
<body>
<div>
  <h1>Header 1</h1>
  <p>Text</p>
  <h2>Header 2</h2>
  <pre>
    <b>Param1</b> = Y<br><b>Param2</b> = 1<br><b>Param3</b> = 2<br><b>Param4</b> = A
    <br><b>Param5</b> = A,B,C,D,E<br><b>Param6</b> = 7<br><b>Param7</b> = Five<br>
  </pre>
</div>
</body>
</html>
rF   zHeader 1r   zHeader 2zd    Param1 = Y
Param2 = 1
Param3 = 2
Param4 = A
    
Param5 = A,B,C,D,E
Param6 = 7
Param7 = Five

  r   )Param1Param2Param3Param4Param5Param6Param7)br   r   r   r   r   r   )r   r   r   r#   r   emphasized_text_tagsr   s      r(   "test_partition_html_br_tag_parsingr   I  s    	 " 9---Hjfj.	
 	
	     	A:. 3 3 3     :*.Q.Q.QQQQQQQr*   c                     d} t          |           }|t          d          t          d          t          d          gk    sJ d S )Nz`<html>
<body>
<div>
    Head
    <div><span>Nested</span></div>
    Tail
</div>
</body>
</html>
rF   HeadNestedTailr   r   s     r(   $test_partition_html_tag_tail_parsingr   t  sN    	  9---HfuXfFFFFFFFr*   c                 2    t          d          g k    sJ d S )N rF   r   rb   r*   r(   5test_partition_html_from_text_works_with_empty_stringr     s$    r"""b((((((r*   c                     d} t          |           }|t          d          t          d          t          d          gk    sJ |d         j        j        dgk    sJ |d         j        j        dgk    sJ d S )	Na  
    <div>
      We start out normally
      <cite>
        and then add a citation
        <p>But whoa, this is a paragraph inside a phrasing element.</p>
        so we close the first element at the start of the block element and emit it, then we
        <b>emit</b> the block element,
      </cite>
      and then start a new element for the tail and whatever phrasing follows it.
    </div>
    rF   z-We start out normally and then add a citationz8But whoa, this is a paragraph inside a phrasing element.zso we close the first element at the start of the block element and emit it, then we emit the block element, and then start a new element for the tail and whatever phrasing follows it.r   emitr   )r   r   r#   r   r   r   s     r(   Jtest_partition_html_accommodates_block_item_nested_inside_phrasing_elementr     s    I 9---HEFFPQQ[	
 	
     A;8VHDDDDA;4======r*   c                 :   d} t          |           }d |D             g dk    sJ |d         }|j        j        dgk    sJ |j        j        dgk    sJ t	          d |d	d          D                       sJ t	          d
 |d	d          D                       sJ d S )Nz
    <div>
      O Deep Thought
      <a href="http://eie.io">
        computer, he said,
        <p>The task we have designed you to perform is this.</p>
        We want you to tell us....
      </a>
      he paused,
    </div>
    rF   c                    g | ]	}|j         
S rb   rF   r%   s     r(   
<listcomp>zMtest_partition_html_handles_anchor_with_nested_block_item.<locals>.<listcomp>  s    %%%qAF%%%r*   )z!O Deep Thought computer, he said,z1The task we have designed you to perform is this.z%We want you to tell us.... he paused,r   zcomputer, he said,zhttp://eie.ioc              3  2   K   | ]}|j         j        d u V  d S r@   )r#   
link_textsr%   s     r(   r)   zLtest_partition_html_handles_anchor_with_nested_block_item.<locals>.<genexpr>  s,      CCqz$,CCCCCCr*   r   c              3  2   K   | ]}|j         j        d u V  d S r@   )r#   r   r%   s     r(   r)   zLtest_partition_html_handles_anchor_with_nested_block_item.<locals>.<genexpr>  s,      BBqz#t+BBBBBBr*   )r   r#   r   r   r/   )r   r2   link_annotated_elements      r(   9test_partition_html_handles_anchor_with_nested_block_itemr     s    
I 9---H%%H%%% * * *    
 &a[!*5:N9OOOOO!*48IIIIICChqrrlCCCCCCCCBBXabb\BBBBBBBBBBr*   c                 6   d} t          |           }|t          d          t          d          t          d          t          d          t          d          t          d          t          d	          t	          d
          t          d          g	k    sJ d S )Na  <div dir=3D"ltr">Hi All,
  <div><br></div>
  <div>Get excited for our first annual family day!</div>
  <div>Best.<br clear="all">
    <div><br></div>
    -- <br>
    <div dir=3D"ltr">
      <div dir=3D"ltr">Dino the Datasaur<div>
      Unstructured Technologies<br>
      <div>Data Scientist</div>
      <div>Doylestown, PA 18901</div>
      <div><br></div>
    </div>
  </div>
  See you there!
</div>
rF   zHi All,z,Get excited for our first annual family day!zBest.z--zDino the DatasaurzUnstructured TechnologieszData ScientistzDoylestown, PA 18901zSee you there!)r   r   r   r   r   r   s     r(   'test_containers_with_text_are_processedr     s    	 & 9---HYDEEgT

!"")**&''&''
 
 
 
 
 
 
r*   c                 r    d} t          |           }|t          d          t          d          gk    sJ d S )Nz<html>
  <body>
    <ol>
      <li>Happy Groundhog's day!</li>
      <li>Looks like six more weeks of winter ...</li>
    </ol>
  </body>
</html>
rF   Happy Groundhog's day!'Looks like six more weeks of winter ...r   r   r   s     r(   %test_html_grabs_bulleted_text_in_tagsr     s\    	  9---H)**:;;      r*   c                 r    d} t          |           }|t          d          t          d          gk    sJ d S )Nz<html>
  <body>
    <p>
      <span>&#8226; Happy Groundhog's day!</span>
    </p>
    <p>
      <span>&#8226; Looks like six more weeks of winter ...</span>
    </p>
  </body>
</html>
rF   r   r   r   r   s     r(   &test_html_grabs_bulleted_text_in_parasr     s^    		  9---H )**:;;      r*   c                 R    t          d          } | t          d          gk    sJ d S )Nz&<p>Hello again peet mag<i>ic</i>al</p>rF   zHello again peet magicalr   rH   s    r(   test_joins_tag_text_correctlyr     s6    #KLLLH899:::::::r*   c                 R    t          d          } | t          d          gk    sJ d S )Nu8   <html charset="unicode">
<p>Hello again 😀</p>
</html>rF   u   Hello again 😀)r   r   rH   s    r(   test_sample_doc_with_emojir     s6    #_```H&899:::::::r*   c                 R    t          d          } | t          d          gk    sJ d S )Nz<body>Hello</body>rF   Hellor   rH   s    r(   &test_only_text_and_no_elements_in_bodyr     s3    #7888Hg'''''''r*   c                 n    t          d          } | t          d          t          d          gk    sJ d S )Nz<body>Hello<p>World</p></body>rF   r   Worldr   rH   s    r(   !test_text_before_elements_in_bodyr   "  s;    #CDDDHgg7777777r*   c                 R    t          d          } | t          d          gk    sJ d S )Nz<div>Hello<br/>World</div>rF   zHello Worldr   rH   s    r(   test_line_break_in_containerr   '  s5    #?@@@Hm,,-------r*   )delformnoscriptc                H    d|  d|  d}t          |          }|g k    sJ d S )Nz
<body>
  <z#>
    There is some text here.
  </z
>
</body>
rF   r   )r   r   r2   s      r(   test_exclude_tag_typesr  ,  s;    YcYYYYYI9---Hr>>>>>>r*   c                     t          d          } t          | d          }t          t          |                     }t          d |D                       sJ ||k    sJ d S )Nr5   by_title)chunking_strategyc              3  Z   K   | ]&}t          |t          t          t          f          V  'd S r@   )r   r   r   r   )r&   cs     r(   r)   zCtest_partition_html_can_chunk_while_partitioning.<locals>.<genexpr>@  s4      TTz!.zBCCTTTTTTr*   )r	   r   r   r/   )r0   chunkschunks_2s      r(   0test_partition_html_can_chunk_while_partitioningr  <  sq     !677IIDDDFnY7788HTTVTTTTTTTTXr*   c                 l    t          dd          t          d          t          d          gk    sJ d S )Nz<html>
  <header>
    <p>Header</p>
  </header>
  <body>
    <h1>My First Heading</h1>
    <p>It was a dark and stormy night. No one was around.</p>
  </body>
  <footer>
    <p>Footer</p>
  </footer>
</html>
T)rG   skip_headers_and_footerszMy First Headingz2It was a dark and stormy night. No one was around.r   rb   r*   r(   0test_partition_html_can_skip_headers_and_footersr  G  s[     "&  " 	 !!JKK
!     r*   c                     d t          t          d                    D             } t          |           t          t          |                     k    sJ d S )Nc                    g | ]	}|j         
S rb   idr%   s     r(   r   z3test_all_element_ids_are_unique.<locals>.<listcomp>b  s    
d
d
dA14
d
d
dr*   z&fake-html-with-duplicate-elements.html)r   r	   r.   set)idss    r(   test_all_element_ids_are_uniquer  a  sP    
d
d(89a(b(bcc
d
d
dCs88s3s88}}$$$$$$r*   c                 z    d t          d          D             } d t          d          D             }| |k    sJ d S )Nc                    g | ]	}|j         
S rb   r  r%   s     r(   r   z6test_element_ids_are_deterministic.<locals>.<listcomp>g  s    
_
_
_A14
_
_
_r*   z3example-docs/fake-html-with-duplicate-elements.htmlc                    g | ]	}|j         
S rb   r  r%   s     r(   r   z6test_element_ids_are_deterministic.<locals>.<listcomp>h  s    aaaaQTaaar*   r   )r  ids_2s     r(   "test_element_ids_are_deterministicr  f  sK    
_
_(]^^
_
_
_Caa>*_``aaaE%<<<<<<r*   c                 @   t          d          } t          |           dk    sJ | d         }t          |t                    sJ |j        dk    sJ |j        j        J |j        j        J | d         }t          |t                    sJ |j        dk    sJ |j        j        dk    sJ |j        j        J | d         }t          |t                    sJ |j        d	k    sJ |j        j        J |j        j        | d         j	        k    sJ | d
         }t          |t                    sJ |j        dk    sJ |j        j        dk    sJ |j        j        | d         j	        k    sJ | d         }t          |t                    sJ |j        dk    sJ |j        j        dk    sJ |j        j        | d         j	        k    sJ | d         }t          |t                    sJ |j        dk    sJ |j        j        J |j        j        | d         j	        k    sJ d S )Na  <html>
  <p>Preamble gets no category_depth or parent_id</p>
  <h1>Heading gets category_depth but no parent_id</h1>
  <p>Body paragraph gets parent_id but no category_depth</p>
  <ul>
    <li>List item gets category_depth and parent_id</li>
    <li>Second list item gets category_depth and parent_id</li>
  </ul>
  <p>Body paragraph after list gets parent_id but no category_depth</p>
</html>
rF   r   r   z,Preamble gets no category_depth or parent_idr   z,Heading gets category_depth but no parent_idr   z3Body paragraph gets parent_id but no category_depthr   z+List item gets category_depth and parent_idr   z2Second list item gets category_depth and parent_idr   z>Body paragraph after list gets parent_id but no category_depth)r   r.   r   r   rG   r#   r   	parent_idr   r  r   r   s     r(   .test_partition_html_records_hierarchy_metadatar  s  ss   	  H x==AAa'''''6CCCCC:$,,,:'''Aa6CCCCC:$)))):'''Aa'''''6JJJJJ:$,,,:8A;>1111Aa"""""6BBBBB:$)))):8A;>1111Aa"""""6IIIII:$)))):8A;>1111Aa'''''6UUUUU:$,,,:8A;>111111r*   c                    t          d          } | d         }|t          d          k    sJ |j        j        dgk    sJ |j        j        dgk    sJ | d         }|t          d          k    sJ |j        j        g d	k    sJ |j        j        g d
k    sJ | d         }|t          d          k    sJ |j        j        dgk    sJ |j        j        dgk    sJ | d         }|t          d          k    sJ |j        j        J |j        j        J | d         }|t          d          k    sJ |j        j        J |j        j        J d S )Nz<html>
  <p>Hello there I am a very <strong>important</strong> text!</p>
  <p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>
  <ul>
    <li><em>Parrots</em></li>
    <li>Dogs</li>
  </ul>
  <span>A lone span text!</span>
</html>
rF   r   z'Hello there I am a very important text!	importantr   r   $Here is a list of my favorite things)myfavoritethings)r   bir   r   Parrotsir   Dogsr   zA lone span text!)r   r   r#   r   r   r   r   r   s     r(   *test_partition_html_grabs_emphasized_textsr*    s     H 	AGHHHHHH:.;-????:*se3333ADEEEEEE:.2N2N2NNNNN:*.>.>.>>>>>A######:.9+====:*se3333A      :.666:*222A)******:.666:*22222r*   c                     t          t          d                    } t          |           dk    sJ t          d | D                       sJ t          d | D                       sJ d S )Nr5   r   c              3  6   K   | ]}|j         j        d k    V  dS )r5   Nr"   r%   s     r(   r)   zatest_partition_html_from_filename_uses_source_filename_for_metadata_by_default.<locals>.<genexpr>  s-      NNqz"&;;NNNNNNr*   c              3  P   K   | ]!}|j         j        t          d           k    V  "dS )r   N)r#   file_directoryr	   r%   s     r(   r)   zatest_partition_html_from_filename_uses_source_filename_for_metadata_by_default.<locals>.<genexpr>  s5      SSQqz(,<R,@,@@SSSSSSr*   r   r	   r.   r/   rH   s    r(   Ntest_partition_html_from_filename_uses_source_filename_for_metadata_by_defaultr0    s|    ./DEEFFHx==1NNXNNNNNNNNSS(SSSSSSSSSSr*   c                     t          t          d          d          } t          |           dk    sJ t          d | D                       sJ d S )Nr5   r   )metadata_filenamer   c              3  6   K   | ]}|j         j        d k    V  dS r   Nr"   )r&   r   s     r(   r)   zNtest_partition_html_from_filename_prefers_metadata_filename.<locals>.<genexpr>  s-      KKww(F2KKKKKKr*   r/  rH   s    r(   ;test_partition_html_from_filename_prefers_metadata_filenamer5    s_    ./DEEY_```Hx==1KK(KKKKKKKKKKr*   c                     t          t          d          d          5 } t          | d          }d d d            n# 1 swxY w Y   t          |          dk    sJ t	          d |D                       sJ d S )Nr5   r<   r   )r>   r2  r   c              3  6   K   | ]}|j         j        d k    V  dS r4  r"   r%   s     r(   r)   zJtest_partition_html_from_file_prefers_metadata_filename.<locals>.<genexpr>  s,      ??qz"f,??????r*   rA   rB   s     r(   7test_partition_html_from_file_prefers_metadata_filenamer8    s    	455t	<	< D!qFCCCD D D D D D D D D D D D D D D x==1??h??????????s   <A A c                 p    t          t          d                    } | d         j        j        dgk    sJ d S )Nr5   r   eng)r   r	   r#   	languagesrH   s    r(   2test_partition_html_element_metadata_has_languagesr<    s<    ./DEEFFHA;)eW444444r*   c                 |    t          t          d          d          } d | D             dgddgdgdgdggk    sJ d S )Nzlanguage-docs/eng_spa_mult.htmlT)detect_language_per_elementc                &    g | ]}|j         j        S rb   )r#   r;  r%   s     r(   r   zLtest_partition_html_respects_detect_language_per_element.<locals>.<listcomp>  s    333QAJ 333r*   r:  spa)r   r	   rH   s    r(   8test_partition_html_respects_detect_language_per_elementrA    sv    :;;Y]  H 43(333					8      r*   requestr   c                    t          | dd          }t          d          }t          |          }|                    |           |sJ t	          d |D                       sJ d S )N<unstructured.partition.html.partition.get_last_modified_date2024-06-17T22:22:20)rX   fake-html.htmlc              3  6   K   | ]}|j         j        d k    V  dS )rE  Nr#   last_modifiedr%   s     r(   r)   zXtest_partition_html_from_filename_pulls_last_modified_from_filesystem.<locals>.<genexpr>
  -      SSQqz'+@@SSSSSSr*   )r   r	   r   rY   r/   )rB  get_last_modified_date_r0   r2   s       r(   Etest_partition_html_from_filename_pulls_last_modified_from_filesystemrL    s    +F*  
 !!122Ii((H33I>>>OOOSS(SSSSSSSSSSr*   c                     t          t          d          d          } t          | d         t                    sJ t	          d | D                       sJ d S )NrF  2023-07-05T09:24:28)metadata_last_modifiedr   c              3  6   K   | ]}|j         j        d k    V  dS )rN  NrH  r%   s     r(   r)   zStest_partition_html_from_filename_prefers_metadata_last_modified.<locals>.<genexpr>  rJ  r*   )r   r	   r   r   r/   rH   s    r(   @test_partition_html_from_filename_prefers_metadata_last_modifiedrQ    sj    )**CX  H hqk5)))))SS(SSSSSSSSSSr*   c                    d} t          |           }|d         }|t          d          k    sJ |j        j        dgk    sJ |j        j        dgk    sJ |d         }|t          d          k    sJ |j        j        J |j        j        J |d	         }|t          d
          k    sJ |j        j        dgk    sJ |j        j        d
gk    sJ |d         }|t          d          k    sJ |j        j        J |j        j        J |d         }|t          d          k    sJ |j        j        dgk    sJ |j        j        dgk    sJ d S )Na  <html>
  <p>Hello there I am a <a href="/link">very important link!</a></p>
  <p>Here is a list of my favorite things</p>
  <ul>
    <li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li>
    <li>Dogs</li>
  </ul>
  <a href="/loner">A lone link!</a>
</html>
rF   r   z'Hello there I am a very important link!/linkzvery important link!r   r"  r   r'  z$https://en.wikipedia.org/wiki/Parrotr   r)  r   A lone link!/loner)r   r   r#   r   r   r   r   r   s      r(   test_partition_html_grabs_linksrV    s   	  9---HAGHHHHHH:G9,,,,: %;$<<<<<ADEEEEEE:''': (((A######:$J#KKKKK: YK////A      :''': (((An%%%%%%:H:----: ^$4444444r*   c                    d} t          |           }|d         }|j        j        dgk    sJ |j        j        dgk    sJ |d         }|j        j        dgk    sJ |j        j        dgk    sJ |d	         }|j        j        dgk    sJ |j        j        dgk    sJ |d
         }|j        j        ddgk    sJ |j        j        ddgk    sJ d S )Nz<html>
  <a href="/loner">A lone link!</a>
  <p>Hello <a href="/link">link!</a></p>
  <p>
   Hello <a href="/link">link!</a></p>
  <p><a href="/wiki/parrots">Parrots</a> and <a href="/wiki/dogs">Dogs</a></p>
</html>
rF   r   rT  rU  r   zlink!rS  r   r   r'  r)  z/wiki/parrotsz
/wiki/dogs)r   r#   r   r   r   s      r(   test_partition_html_linksrX  >  s   	  9---HA: ^$44444:H:----A: WI----:G9,,,,A: WI----:G9,,,,A: Y$77777:O\#BBBBBBBr*   )r   expected_value)z:<table><tr><th>Header 1</th><th>Header 2</th></tr></table>r   )zd<table><tr><td>Dimensions</td><td>Weight</td></tr><tr><td>4'-6" x 1'</td><td>18 kg</td></tr></table>zs<table><tr><td>Dimensions</td><td>Weight</td></tr><tr><td>4&#x27;-6&quot; x 1&#x27;</td><td>18 kg</td></tr></table>r   rY  c                    t          |           }t          |          dk    sJ |d         j        j        |k    sJ d S )NrF   r   r   )r   r.   r#   r   )r   rY  r2   s      r(   <test_partition_html_applies_text_as_html_metadata_for_tablesr[  [  sL    . 9---Hx==AA;,>>>>>>r*   c                   t          t          d          dddi          | _        t          d          }|                     di d	           t          |          d
k    sJ t          d |D                       sJ d S )Nr5   rL   rM   rN   rO   https://trusttheforceluke.comrS   TrU   r   c              3  6   K   | ]}|j         j        d k    V  dS )r]  N)r#   rT   r%   s     r(   r)   zDtest_partition_html_from_url_adds_url_to_metadata.<locals>.<genexpr>  s,      SSQqz~!@@SSSSSSr*   )rW   r
   rX   r   rY   r.   r/   rZ   s     r(   1test_partition_html_from_url_adds_url_to_metadatar_  {  s    !-344-" " "M "ABBBH))*ISU^b)cccx==1SS(SSSSSSSSSSr*   c                 \    t          t          d                    } t          |            d S )Nr5   )r   r	   r   rH   s    r(   ,test_partition_html_round_trips_through_jsonra    s-    ./DEEFFH#H-----r*   uD   Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020rF   c                      e Zd Zi fd	dZdS )
rW   rG   r+   rP   intrQ   dict[str, str]c                F    || _         || _        |dk     | _        || _        d S )Ni,  )rG   rP   okrQ   )selfrG   rP   rQ   s       r(   __init__zFakeResponse.__init__  s(    	&#r*   N)rG   r+   rP   rc  rQ   rd  )__name__
__module____qualname__rh  rb   r*   r(   rW   rW     s0        NP       r*   rW   returndict[str, Any]c            
         dddddi dddd	S )zAll default arguments for `HtmlPartitionerOptions`.

    Individual argument values can be changed to suit each test. Makes construction of opts more
    compact for testing purposes.
    NTF)	r>   r0   rG   ri   rT   rQ   
ssl_verifyr  detection_originrb   rb   r*   r(   	opts_argsrq    s.     $) 
 
 
r*   pytest.FixtureRequestc                "    t          | d          S )Nz2unstructured.partition.html.partition.requests.getr   )rB  s    r(   rJ   rJ     s    "VWWWr*   c                  f   e Zd ZdZej                            dddg          dd            Zej                            d	d
dg          d d            Zd!dZ	d!dZ
d!dZd"dZd!dZd#dZd!dZej                            dddg          d$d            Z ej                    d%d            ZdS )&DescribeHtmlPartitionerOptionszSUnit-test suite for `unstructured.partition.html.partition.HtmlPartitionerOptions`.rp  r   N
str | Nonerq  rm  c                B    ||d<   t          di |}|j        |k    sJ d S )Nrp  rb   )r   rp  )rg  rp  rq  optss       r(   -it_knows_the_caller_provided_detection_originzLDescribeHtmlPartitionerOptions.it_knows_the_caller_provided_detection_origin  s=     )9	$%%22	22$(8888888r*   ri   rg   c                B    ||d<   t          di |}|j        |k    sJ d S )Nri   rb   )r   ri   )rg  ri   rq  ry  s       r(   %it_knows_the_caller_provided_encodingzDDescribeHtmlPartitionerOptions.it_knows_the_caller_provided_encoding  s:     !)	*%22	22}((((((r*   c                    t          d          }||d<   t          di |}|j        }t          |t                    sJ |t          |          d         k    sJ d S )Nr5   r0   r   rb   )r	   r   r   r   r+   r   )rg  rq  r0   ry  r   s        r(   8it_gets_the_HTML_from_the_file_path_when_one_is_providedzWDescribeHtmlPartitionerOptions.it_gets_the_HTML_from_the_file_path_when_one_is_provided  sn    $%:;;	!*	+%22	22N	)S)))))M)44Q7777777r*   c                T   t          d          }t          |d          5 }t          j        |                                          }d d d            n# 1 swxY w Y   ||d<   t          di |}|j        }t          |t                    sJ |t          |          d         k    sJ d S )Nr5   r<   r>   r   rb   )
r	   r,   ru   rv   rw   r   r   r   r+   r   )rg  rq  r0   r1   r>   ry  r   s          r(   Cand_it_gets_the_HTML_from_the_file_like_object_when_one_is_providedzbDescribeHtmlPartitionerOptions.and_it_gets_the_HTML_from_the_file_like_object_when_one_is_provided  s     %%:;;	)T"" 	(a:affhh''D	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	&%22	22N	)S)))))M)44Q7777777s   'AAAc                B    d|d<   t          di |}|j        dk    sJ d S )Nz-<html><body><p>Hello World!</p></body></html>rG   rb   )r   r   rg  rq  ry  s      r(   ?and_it_uses_the_HTML_in_the_text_argument_when_that_is_providedz^DescribeHtmlPartitionerOptions.and_it_uses_the_HTML_in_the_text_argument_when_that_is_provided  s<     L	&%22	22~!PPPPPPPr*   rJ   r   c                t    t          ddddi          |_        d|d<   t          di |}|j        dk    sJ d S )	Nz?<html><body><p>I just flew over the internet!</p></body></html>rL   rM   rN   rO   zhttps://insta.tweet.face.orgrT   rb   )rW   rX   r   r   )rg  rJ   rq  ry  s       r(   6and_it_gets_the_HTML_from_the_url_when_one_is_providedzUDescribeHtmlPartitionerOptions.and_it_gets_the_HTML_from_the_url_when_one_is_provided   sb     &2R#[1&
 &
 &
"
 :	%%22	22~!bbbbbbbr*   c                    t          di |}t          j        t          d          5  |j         d d d            d S # 1 swxY w Y   d S )Nz3Exactly one of filename, file, text, or url must ber]   rb   )r   r_   r`   ra   r   r  s      r(   >but_it_raises_when_no_path_or_file_or_text_or_url_was_providedz]DescribeHtmlPartitionerOptions.but_it_raises_when_no_path_or_file_or_text_or_url_was_provided  s     &22	22]:-bccc 	 	NN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   =AArK  c                ~    d|d<   d|_         t          di |}|j        }|                    d           |dk    sJ d S )Nza/b/document.htmlr0   z2024-04-02T20:32:35rb   )rX   r   rI  rY   )rg  rq  rK  ry  rI  s        r(   Dit_gets_last_modified_from_the_filesystem_when_file_path_is_providedzcDescribeHtmlPartitionerOptions.it_gets_last_modified_from_the_filesystem_when_file_path_is_provided  s_     "5	+/D,%22	22*778KLLL 5555555r*   c                f    t          j        d          }||d<   t          di |}|j        }|J d S )Ns   abcdefgr>   rb   )ru   rv   r   rI  )rg  rq  r>   ry  rI  s        r(   Rbut_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_providedzqDescribeHtmlPartitionerOptions.but_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_provided#  sJ     z*%% 	&%22	22*$$$$$r*   r  TFboolc                >    ||d<   t          di |}|j        |u sJ d S )Nr  rb   )r   r  )rg  r  rq  ry  s       r(   =it_knows_the_caller_provided_skip_headers_and_footers_settingz\DescribeHtmlPartitionerOptions.it_knows_the_caller_provided_skip_headers_and_footers_setting0  s>     1I	,-%22	22,0HHHHHHHr*   rB  r   rl  c                "    t          |d          S )NrD  rt  )rg  rB  s     r(   rK  z6DescribeHtmlPartitionerOptions.get_last_modified_date_;  s    S
 
 	
r*   )rp  rw  rq  rm  )ri   rw  rq  rm  rq  rm  )rJ   r   rq  rm  )rq  rm  rK  r   )r  r  rq  rm  )rB  r   rl  r   )ri  rj  rk  __doc__r_   markparametrizerz  r|  r~  r  r  r  r  r  r  r  fixturerK  rb   r*   r(   rv  rv    s       ]] [/&$@@9 9 9 A@9 [Z'499) ) ) :9)8 8 8 88 8 8 8Q Q Q Qc c c c   
6 
6 
6 
6	% 	% 	% 	% [7$GGI I I HGI V^
 
 
 
 
 
r*   rv  c                  J    e Zd ZdZddZddZddZddZddZdd	Z	dd
Z
dS )Describe_HtmlPartitionerzMUnit-test suite for `unstructured.partition.html.partition._HtmlPartitioner`.rq  rm  c                j    d|d<   t          di |}t          |          }|j        j        dk    sJ d S )Nz<body>
  <header></header>
  <p>Lots preamble stuff yada yada yada</p>
  <main>
    <h2>A Wonderful Section!</h2>
    <p>Look at this amazing section!</p>
  </main>
</body>
rG   mainrb   r   r   _mainr   rg  rq  ry  partitioners       r(   ,it_can_find_the_main_element_in_the_documentzEDescribe_HtmlPartitioner.it_can_find_the_main_element_in_the_documentG  sP     	& &22	22&t,, $......r*   c                j    d|d<   t          di |}t          |          }|j        j        dk    sJ dS )zQAnd there is always a <body>, the parser adds one if there's not one in the HTML.z<body>
  <header></header>
  <p>Lots preamble stuff yada yada yada</p>
  <h2>A Wonderful Section!</h2>
  <p>Look at this amazing section!</p>
</body>
rG   bodyNrb   r  r  s       r(   3and_it_falls_back_to_the_body_when_there_is_no_mainzLDescribe_HtmlPartitioner.and_it_falls_back_to_the_body_when_there_is_no_mainX  sP     	& &22	22&t,, $......r*   c                    d|d<   t          di |}t          t          j        |                    \  }|t	          d          k    sJ d S )Nz<p>NO PARTICULAR TYPE.</p>rG   zNO PARTICULAR TYPE.rb   )r   listr   iter_elementsr   rg  rq  ry  r   s       r(   Mit_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_titlezfDescribe_HtmlPartitioner.it_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_titlej  s\     9	&%22	22*8>>??
$455555555r*   c                    d|d<   t          di |}t          t          j        |                    \  }|t	          d          k    sJ d S )Nu   <p>● An excellent point!</p>rG   zAn excellent point!rb   )r   r  r   r  r   r  s       r(   Wit_produces_a_ListItem_element_when_the_tag_contains_are_preceded_by_a_bullet_characterzpDescribe_HtmlPartitioner.it_produces_a_ListItem_element_when_the_tag_contains_are_preceded_by_a_bullet_charactert  s\     =	&%22	22*8>>??
(#899999999r*   c                v    d|d<   t          di |}t          t          j        |                    g k    sJ d S )Nu
   <p>●</p>rG   rb   r   r  r   r  r  s      r(   Abut_not_when_the_tag_contains_only_a_bullet_character_and_no_textzZDescribe_HtmlPartitioner.but_not_when_the_tag_contains_only_a_bullet_character_and_no_text~  sL     )	&%22	22$248899R??????r*   c                v    d|d<   t          di |}t          t          j        |                    g k    sJ d S )Nz<p></p>rG   rb   r  r  s      r(   2it_produces_no_element_when_the_tag_has_no_contentzKDescribe_HtmlPartitioner.it_produces_no_element_when_the_tag_has_no_content  sJ    %	&%22	22$248899R??????r*   c                v    d|d<   t          di |}t          t          j        |                    g k    sJ d S )Nz<p>$</p>rG   rb   r  r  s      r(   <and_it_produces_no_element_when_the_tag_contains_only_a_stubzUDescribe_HtmlPartitioner.and_it_produces_no_element_when_the_tag_contains_only_a_stub  sL     '	&%22	22$248899R??????r*   Nr  )ri  rj  rk  r  r  r  r  r  r  r  r  rb   r*   r(   r  r  B  s        WW/ / / /"/ / / /$6 6 6 6: : : :@ @ @ @@ @ @ @@ @ @ @ @ @r*   r  )r   r   )rJ   r   )r$   r+   )r   r+   r   r+   )r   r+   )rB  r   )r   r+   rY  r+   )rl  rm  )rB  rr  )lr  
__future__r   ru   pathlibtypingr   r_   lxmlr   test_unstructured.unit_utilsr   r   r   r	   r
   r   unstructured.chunking.titler   unstructured.cleaners.corer   unstructured.documents.elementsr   r   r   r   r   r   r   r    unstructured.file_utils.encodingr   unstructured.partition.htmlr   %unstructured.partition.html.partitionr   r   r3   r:   rC   rI   r[   rc   r  r  rm   rr   rx   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r*  r0  r5  r8  r<  rA  rL  rQ  rV  rX  r[  r_  ra  rq   rW   r  rq  rJ   rv  r  rb   r*   r(   <module>r     s;   E D " " " " " " 				                              7 6 6 6 6 6 = = = = = =	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ; : : : : : 6 6 6 6 6 6 Z Z Z Z Z Z Z Z$K $K $K $KN   > > >  

 
 
 
   *,TU 5 5 5 5 ccc 7 7 7	 7 *,TU 4 4 4 4 ccc 7 7 7	 7 *,TU 5 5 5 5 ccc 7 7 7	 7. . .  / / / // / / /   &/ / /6= = =
/ / /$  6# # #L0 0 0& & &R $OO 
F 
F 
F 
F  6) ) )^&* &* &*R@ @ @   (R (R (RVG G G*) ) )> > >:C C C6     F  (  .; ; ;
; ; ;
( ( (
8 8 8
. . .
  ; ; ;<<   =<    4% % %
  .2 .2 .2h!3 !3 !3NT T TL L L@ @ @5 5 5
  "T T T TT T T"5 "5 "5JC C C: #	

	
 (? ? ?) (?T T T T&. . . 
EUVVV 
           & X X X Xz
 z
 z
 z
 z
 z
 z
 z
zP@ P@ P@ P@ P@ P@ P@ P@ P@ P@r*   