
    Ng*                      d Z ddlmZ ddlmZmZ ddlZddlmZ ddl	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZ ddlmZmZmZ ddlmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$  G d	 d
          Z% G d d          Z& G d d          Z' G d d          Z( G d d          Z) G d d          Z* G d d          Z+ G d d          Z, G d d          Z- G d d          Z. G d d          Z/ G d d           Z0 G d! d"          Z1dS )#z<Unit-test suite for the `unstructured.chunking.base` module.    )annotations)AnySequenceN)fragment_fromstring)ChunkingOptionsPreChunkBuilderPreChunkCombiner
PreChunkerTablePreChunkTextPreChunkTextPreChunkAccumulator_CellAccumulator_RowAccumulator_TableSplitter_TextSplitteris_on_next_pageis_title)HtmlCellHtmlRow	HtmlTable)	CheckBoxCompositeElementElementElementMetadata	PageBreakTable
TableChunkTextTitlec                     e Zd ZdZej                            dg d          d#d            Zd Zej                            ddd	g          d$d            Z	ej                            dddidfddidfddidfi dfg          d%d            Z
ej                            dddg          d&d            Zd Zd Zd Zd Zd Zd  Zd! Zd" ZdS )'DescribeChunkingOptionszIUnit-test suite for `unstructured.chunking.base.ChunkingOptions` objects.max_characters)r   intc                    t          j        t          d|           5  t          |                                           d d d            d S # 1 swxY w Y   d S )Nz+'max_characters' argument must be > 0, got matchr"   pytestraises
ValueErrorr   	_validate)selfr"   s     `/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/chunking/test_base.py/it_rejects_max_characters_not_greater_than_zerozGDescribeChunkingOptions.it_rejects_max_characters_not_greater_than_zero0   s    ]PPP
 
 
 	G 	G >:::DDFFF		G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G   #AAAc                    	 t          d                                           dS # t          $ r t          j        d           Y dS w xY w)a:  Caller can specify `max_characters` arg without specifying any others.

        In particular, When `combine_text_under_n_chars` is not specified it defaults to the value
        of `max_characters`; it has no fixed default value that can be greater than `max_characters`
        and trigger an exception.
        2   r)   z3did not accept `max_characters` as option by itselfN)r   r.   r-   r+   failr/   s    r0   =it_does_not_complain_when_specifying_max_characters_by_itselfzUDescribeChunkingOptions.it_does_not_complain_when_specifying_max_characters_by_itself8   sd    	O2...88::::: 	O 	O 	OKMNNNNNN	Os   "& AA)combine_text_under_n_charsexpected_value)Nr   )*   r:   r8   
int | Noner9   c                @    t          |          }|j        |k    sJ dS )a  Subclasses can store `combine_text_under_n_chars` but must validate and enable it.

        The `combine_text_under_n_chars` option is not used by all chunkers and its behavior can
        differ between subtypes. It is present in and stored by the contructur but it defaults to
        `0` (no pre-chunk combining) and must be overridden by subclasses to give it the desired
        behavior.
        r8   N)r   r8   )r/   r8   r9   optss       r0   Qit_accepts_combine_text_under_n_chars_in_constructor_but_defaults_to_no_combiningziDescribeChunkingOptions.it_accepts_combine_text_under_n_chars_in_constructor_but_defaults_to_no_combiningD   s/     :TUUU..@@@@@@    )kwargsr9   include_orig_elementsTFNrA   dict[str, Any]boolc                0    t          di |j        |u sJ d S )N )r   rB   )r/   rA   r9   s      r0   ?it_knows_whether_to_include_orig_elements_in_the_chunk_metadatazWDescribeChunkingOptions.it_knows_whether_to_include_orig_elements_in_the_chunk_metadataT   s,     ((((>.PPPPPPr@   n_charsr#   r$   c                    t          j        t          d|           5  t          |                                           d d d            d S # 1 swxY w Y   d S )Nz/'new_after_n_chars' argument must be >= 0, got r'   new_after_n_charsr*   )r/   rH   s     r0   1it_rejects_new_after_n_chars_for_n_less_than_zerozIDescribeChunkingOptions.it_rejects_new_after_n_chars_for_n_less_than_zerob   s    ]MGMM
 
 
 	C 	C g666@@BBB		C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	Cr2   c                    t          j        t          d          5  t          dd                                           d d d            d S # 1 swxY w Y   d S )NzE'overlap' argument must be less than `max_characters`, got 300 >= 200r'      i,  r"   overlapr*   r6   s    r0   /it_rejects_overlap_not_less_than_max_characterszGDescribeChunkingOptions.it_rejects_overlap_not_less_than_max_charactersj   s    ]Y
 
 
 	I 	I 3<<<FFHHH		I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	Is   $AAAc                    t          d          }	 |                                 n$# t          $ r t          j        d           Y nw xY w|j        dk    sJ dS )zPCaller can specify `new_after_n_chars` arg without specifying any other options.rN   rJ   z6did not accept `new_after_n_chars` as option by itselfNr   r.   r-   r+   r5   soft_maxr/   r>   s     r0   @it_does_not_complain_when_specifying_new_after_n_chars_by_itselfzXDescribeChunkingOptions.it_does_not_complain_when_specifying_new_after_n_chars_by_itselfq   s|    555	RNN 	R 	R 	RKPQQQQQ	R }######s   ' AAc                h    t          d          }|                                 |j        dk    sJ dS )zSpecifying `new_after_n_chars=0` places each element into its own pre-chunk.

        This puts each element into its own chunk, although long chunks are still split.
        r   rJ   N)r   r.   rT   rU   s     r0   Iit_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunkzaDescribeChunkingOptions.it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk{   s=    
 333}!!!!!!r@   c                    t          dd          }	 |                                 n$# t          $ r t          j        d           Y nw xY w|j        dk    sJ dS )z`new_after_n_chars` > `max_characters` doesn't affect chunking behavior.

        So rather than raising an exception or warning, we just cap that value at `max_characters`
        which is the behavioral equivalent.
        i  i+  r"   rK   z@did not accept `new_after_n_chars` greater than `max_characters`NrS   rU   s     r0   ;it_silently_accepts_new_after_n_chars_greater_than_maxcharszSDescribeChunkingOptions.it_silently_accepts_new_after_n_chars_greater_than_maxchars   s     cSIII	\NN 	\ 	\ 	\KZ[[[[[	\ }######s   ( A	A	c                <    t          d          j        dk    sJ d S )N
   rP   )r   rP   r6   s    r0   2it_knows_how_much_overlap_to_apply_to_split_chunkszJDescribeChunkingOptions.it_knows_how_much_overlap_to_apply_to_split_chunks   s'    r***2b888888r@   c                >    t          dd          j        dk    sJ d S )Nr]   TrP   overlap_allr   inter_chunk_overlapr6   s    r0   Sand_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunkszkDescribeChunkingOptions.and_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunks   s*    rt<<<PTVVVVVVVr@   c                <    t          d          j        dk    sJ d S )Nr]   r^   r   rc   r6   s    r0   -but_it_does_not_overlap_pre_chunks_by_defaultzEDescribeChunkingOptions.but_it_does_not_overlap_pre_chunks_by_default   s'    r***>!CCCCCCr@   c                8    t                      j        dk    sJ d S )Nz

)r   text_separatorr6   s    r0   "it_knows_the_text_separator_stringz:DescribeChunkingOptions.it_knows_the_text_separator_string   s"      /6999999r@   )r"   r%   )r8   r;   r9   r%   )rA   rC   r9   rD   )rH   r%   )__name__
__module____qualname____doc__r+   markparametrizer1   r7   r?   rG   rL   rQ   rV   rX   r[   r_   re   rg   rj   rF   r@   r0   r!   r!   -   s       SS[-|||<<G G G =<G
O 
O 
O [89h:O A A A A [$%t,d3%u-u5%t,d3J		
 Q Q Q Q
 [YS	22C C C 32CI I I$ $ $" " "$ $ $9 9 9W W WD D D: : : : :r@   r!   c                      e Zd ZdZd ZdS )DescribePreChunkerzDUnit-test suite for `unstructured.chunking.base.PreChunker` objects.c                   t          d          t          d          t          d          t          d          t          d          t          d          t                      g}t          dd	          }t	          j        ||
          }t          |          }t          |t                    sJ |j	        t          d          t          d          gk    sJ t          |          }t          |t                    sJ |j	        t          d          gk    sJ t          |          }t          |t                    sJ |j	        t          d          t          d          gk    sJ t          |          }t          |t                    sJ |j	        t          d          t                      gk    sJ t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )NLorem Ipsumz8Lorem ipsum dolor sit amet, consectetur adipiscing elit.zBSed do eiusmod tempor incididunt ut labore et dolore magna aliqua.zUt EnimzHUt enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi.z#Ut aliquip ex ea commodo consequat.   A   rZ   r>   )r   r   r   r   r
   iter_pre_chunksnext
isinstancer   	_elementsr+   r,   StopIteration)r/   elementsr>   pre_chunk_iter	pre_chunks        r0   Git_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_sizezZDescribePreChunker.it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size   sS   -  KLLUVV)[\\677JJ
 cRHHH#3H4HHH((	)\22222"-  KLL'
 
 
 
 

 ((	)\22222"UVV'
 
 
 
 
 ((	)\22222")[\\'
 
 
 
 

 ((	)\22222"t,Q'R'RT\T^T^&_____]=)) 	! 	!   	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!s   G..G25G2N)rk   rl   rm   rn   r   rF   r@   r0   rr   rr      s)        NN(! (! (! (! (!r@   rr   c            
      $   e Zd ZdZd Zd Zej                            d e	d           e
d          g          dd            Zej                            d	 e
d
           e	d          f e
d
           e
d          f e	d           e	d          f e	d           e
d          fg          dd            Zej                            d e
d
           e	d          g          dd            Zd Zd Zd Zd Zd Zd Zd Zd ZdS )DescribePreChunkBuilderzAUnit-test suite for `unstructured.chunking.base.PreChunkBuilder`.c                v    t          t          d                    }|j        dk    sJ |j        dk    sJ d S )Nr4   r)   rw   r   )r   r   _text_length_remaining_spacer/   builders     r0   it_is_empty_on_constructionz3DescribePreChunkBuilder.it_is_empty_on_construction   sK    !b'I'I'IJJJ#q(((('2------r@   c                2   t          t          d                    }|                    t          d                     |j        dk    sJ |j        dk    sJ |                    t          d                     |j        dk    sJ |j        d	k    sJ d S )
Nru   r)   rw   Introduction      bLorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sedlectus porta volutpat.p   $   )r   r   add_elementr   r   r   r   r   s     r0   #it_accumulates_elements_added_to_itz;DescribePreChunkBuilder.it_accumulates_elements_added_to_it   s    !c'J'J'JKKKE.11222#r))))'3....) 	
 	
 	
 #s****'2------r@   elementHeading
Cell texta  abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd r   c                l    t          t                                }|                    |          sJ d S Nrw   )r   r   will_fitr/   r   r   s      r0   3it_will_fit_a_Table_or_oversized_element_when_emptyzKDescribePreChunkBuilder.it_will_fit_a_Table_or_oversized_element_when_empty   s8    !'8'8999(((((((r@   )existing_elementnext_elementabcdzFruits
Mangor   r   c                    t          t                                }|                    |           |                    |          rJ d S r   )r   r   r   r   )r/   r   r   r   s       r0   7but_not_when_it_already_contains_an_element_of_any_kindzODescribePreChunkBuilder.but_not_when_it_already_contains_an_element_of_any_kind   sO     "'8'8999,---##L1111111r@   c                    t          t                                }|                    t          d                     |                    |          rJ d S )Nrw   r   )r   r   r   r   r   r   s      r0   <it_will_not_fit_any_element_when_it_already_contains_a_tablezTDescribePreChunkBuilder.it_will_not_fit_any_element_when_it_already_contains_a_table  sU    !'8'8999E"677888##G,,,,,,,r@   c                    t          t          dd                    }|                    t          d                     |                    t          d                    rJ d S )Nd   r4   rZ   rw   7Lorem ipsum dolor sit amet consectetur adipiscing elit.zIn rhoncus ipsum.r   r   r   r   r   r   s     r0   Bit_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlenzZDescribePreChunkBuilder.it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen	  so    !c]_'`'`'`aaaJKK	
 	
 	
 ##D)<$=$=>>>>>>>r@   c                    t          t          d                    }|                    t          d                     |                    t          d                    rJ d S )Nr   r)   rw   r   z,In rhoncus ipsum sed lectus portos volutpat.r   r   s     r0   Qand_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlenziDescribePreChunkBuilder.and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen  s{    !c'J'J'JKKKJKK	
 	
 	

 ##?@@
 
 	
 	
 	
 	
 	
r@   c                    t          t          d                    }|                    t          d                     |                    t          d                    sJ d S )Nr   r)   rw   r   z+In rhoncus ipsum sed lectus porto volutpat.r   r   s     r0   $but_it_will_fit_an_element_that_fitsz<DescribePreChunkBuilder.but_it_will_fit_an_element_that_fits  sl    !c'J'J'JKKKJKK	
 	
 	

 %R S STTTTTTTr@   c                   t          t          d                    }|                    t          d                     |                    t	          d                     t          |                                          }t          |t                    sJ |j	        t          d          t	          d          gk    sJ |j
        dk    sJ |j        dk    sJ d S )Nru   r)   rw   r   r   r   )r   r   r   r   r   ry   flushrz   r   r{   r   r   r/   r   r   s      r0   Cit_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_emptyz[DescribePreChunkBuilder.it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty%  s    !c'J'J'JKKKE.11222) 	
 	
 	
 ))	)\22222".!!) '
 
 
 
 
 #q(((('3......r@   c                ^   t          t          d                    }|                    t          d                     t	          |                                          }|j        dk    sJ |j        dk    sJ t          |t                    sJ |j
        t          d          k    sJ d S )Nru   r)   rw   r   r   )r   r   r   r   ry   r   r   r   rz   r   _tabler   s      r0   Aand_it_generates_a_TablePreChunk_when_it_contains_a_Table_elementzYDescribePreChunkBuilder.and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element<  s    !c'J'J'JKKKE"677888))	 #q(((('3....)]333335)=#>#>>>>>>>r@   c                    t          t          d                    }t          |                                          }|g k    sJ |j        dk    sJ |j        dk    sJ d S )Nru   r)   rw   r   )r   r   listr   r   r   )r/   r   
pre_chunkss      r0   8but_it_does_not_generate_a_pre_chunk_on_flush_when_emptyzPDescribePreChunkBuilder.but_it_does_not_generate_a_pre_chunk_on_flush_when_emptyL  sq    !c'J'J'JKKK'--//**
R#q(((('3......r@   c                   t          dd          }t          |          }|                    t          d                     t	          |                                          d         }t          |t                    sJ |j        dk    sJ |                    t          d                     t	          |                                          d         }t          |t                    sJ |j        dk    sJ |                    t          d	                     t	          |                                          d         }t          |t                    sJ |j        d
k    sJ d S )N   Tra   rw   r   r   +In rhoncus ipsum sed lectus porta volutpat.z;dipiscing elit.
In rhoncus ipsum sed lectus porta volutpat.z%Donec semper facilisis metus finibus.z6porta volutpat.

Donec semper facilisis metus finibus.)r   r   r   r   r   r   rz   r   _textr   r   _text_with_overlap)r/   r>   r   r   s       r0   Bit_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_nextzZDescribePreChunkBuilder.it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_nextU  sX   rt<<<!t,,,D!Z[[\\\))!,	)\22222"[[[[[E"OPPQQQ))!,	)]33333+J
 
 
 
 	D!HIIJJJ))!,	)\22222"\\\\\\\r@   c                    t          t          d                    }|                    t          d                     |                    t          d                     |j        dk    sJ |j        dk    sJ d S )Nr4   r)   rw   abcdefghijr   r   )r   r   r   r   r   r   r   s     r0   Lit_considers_separator_length_when_computing_text_length_and_remaining_spacezdDescribePreChunkBuilder.it_considers_separator_length_when_computing_text_length_and_remaining_spacem  s    !b'I'I'IJJJDMM***DMM*** #r)))) '2------r@   Nr   r   )r   r   r   r   )rk   rl   rm   rn   r   r   r+   ro   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   rF   r@   r0   r   r      s       KK. . .. . .  [Y/C)D)Ddd=FYFY(Z[[) ) ) \[) [,T&\\55112T&\\44../U'((%%*@*@AU'(($$}*=*=>		
 2 2 2 2 [Yfuu_7M7M(NOO- - - PO-? ? ?	
 	
 	
U U U/ / /.? ? ? / / /] ] ]0. . . . .r@   r   c                      e Zd ZdZd Zd Zd Zd Zd Zd Z	e
j                            dd	d
g          dd            Ze
j                            dddg          dd            Zd Zd Zd ZdS )DescribeTablePreChunkzGUnit-test suite for `unstructured.chunking.base.TablePreChunk` objects.c                   d}d}t          t          |t          |                    dt          d                    }|                                }t          |          }t          |t                    sJ |j        d	k    sJ |j        j	        d
k    sJ t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )Nz<table>
<thead>
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
</thead>
<tbody>
<tr><td>Lorem ipsum  </td><td>adipiscing   </td></tr>
</tbody>
</table>3Header Col 1  Header Col 2
Lorem ipsum   adipiscingtext_as_htmlmetadatactus porta volutpat.   r)   overlap_prefixr>   zHctus porta volutpat.
Header Col 1  Header Col 2
Lorem ipsum   adipiscingzr<table><tr><td>Header Col 1</td><td>Header Col 2</td></tr><tr><td>Lorem ipsum</td><td>adipiscing</td></tr></table>)r   r   r   r   iter_chunksry   rz   textr   r   r+   r,   r|   r/   
html_table
text_tabler   
chunk_iterchunks         r0   >it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_windowzTDescribeTablePreChunk.it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window  sM    	 L
!*J'O'O'OPPP1 444
 
 
	 **,,
Z  %'''''zX
 
 
 
 ~*
 
 
 
 ]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   4CCCc                .   d}t          t          dt          |                    dt          d                    }|                                }t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )	Nz/<table><tr><td/><td>  	  
   </td></tr></table>z  	  
  r   r   z	volutpat.r   r)   r   )	r   r   r   r   r   r+   r,   r|   ry   )r/   r   r   r   s       r0   >but_not_when_the_table_is_is_empty_or_contains_only_whitespacezTDescribeTablePreChunk.but_not_when_the_table_is_is_empty_or_contains_only_whitespace  s    H
!,j)Q)Q)QRRR& 444
 
 
	 **,,
]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   -B

BBc                   t          dt          d                    }t          d          }t          |d|          }|                                }t          |          }t          |t                     sJ |j        j        |gk    sJ |j        j	        dk    sJ t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )Nzfoo barz<table>foo bar</table>r   r   TrB    )r   r   r   r   r   ry   rz   r   orig_elementsr   r+   r,   r|   r/   tabler>   r   r   r   s         r0   Iand_it_includes_the_original_table_element_in_metadata_when_so_instructedz_DescribeTablePreChunk.and_it_includes_the_original_table_element_in_metadata_when_so_instructed  s&   i/G_*`*`*`aaaT:::!%T22	**,,
Z  %'''''~+w6666~*.FFFFF]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   9CCCc                    t          t          d          dt          d                    }t          |                                          }t          |t                    sJ |j        j        J d S )Nfoobarr   Fr   )r   r   r   ry   r   rz   r   r   )r/   r   r   s      r0   but_not_when_instructed_not_toz4DescribeTablePreChunk.but_not_when_instructed_not_to  sk    !%//2]b7c7c7cdd	Y**,,--%'''''~+33333r@   c                   d}d}t          t          |t          |                    dt          dd          	          }|                                }t          |          }t          |t                    sJ |j        d
k    sJ |j	        j
        dk    sJ |j	        j        J t          |          }t          |t                    sJ |j        dk    sJ |j	        j
        dk    sJ |j	        j        sJ t          |          }t          |t                    sJ |j        dk    sJ |j	        j
        dk    sJ |j	        j        sJ t          |          }t          |t                    sJ |j        dk    sJ |j	        j
        dk    sJ |j	        j        sJ t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )Na              <table>
            <thead>
            <tr><th>Header Col 1   </th><th>Header Col 2  </th></tr>
            </thead>
            <tbody>
            <tr><td>Lorem ipsum    </td><td>A Link example</td></tr>
            <tr><td>Consectetur    </td><td>adipiscing elit</td></tr>
            <tr><td>Nunc aliquam   </td><td>id enim nec molestie</td></tr>
            </tbody>
            </table>
        zHeader Col 1   Header Col 2
Lorem ipsum    dolor sit amet
Consectetur    adipiscing elit
Nunc aliquam   id enim nec molestie
Vivamus quis   nunc ipsum donec ac fermentumr   r   r   r   
 )r"   text_splitting_separatorsr   zHeader Col 1 Header Col 2zB<table><tr><td>Header Col 1</td><td>Header Col 2</td></tr></table>zLorem ipsum A Link examplezC<table><tr><td>Lorem ipsum</td><td>A Link example</td></tr></table>zConsectetur adipiscing elitzD<table><tr><td>Consectetur</td><td>adipiscing elit</td></tr></table>z!Nunc aliquam id enim nec molestiezJ<table><tr><td>Nunc aliquam</td><td>id enim nec molestie</td></tr></table>)r   r   r   r   r   ry   rz   r   r   r   r   is_continuationr+   r,   r|   r   s         r0   Kit_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_windowzaDescribeTablePreChunk.it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window  s   
; 	 "*J'O'O'OPPP {[[[
 
 
	 **,,
Z  %,,,,,z88888~*P
 
 
 
 ~-555Z  %,,,,,z99999~*Q
 
 
 
 ~----Z  %,,,,,z:::::~*R
 
 
 
 ~----Z  %,,,,,z@@@@@~*X
 
 
 
 ~----]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   <GG Gc                   t          dt          d                    }t          dd          }t          |d|	          }|                                }t          |          }t          |t                    sJ |j        d
k    sJ |j	        j
        |gk    sJ |j	        j        rJ t          |          }t          |t                    sJ |j        dk    sJ |j	        j
        |gk    sJ |j	        j        sJ dS )zGEven though text and html are split, the orig_elements metadata is not.z8Header Col 1   Header Col 2
Lorem ipsum   dolor sit amet<table/>r   r      Tr"   rB   r   r   zHeader Col 1   Header Col 2zLorem ipsum   dolor sit ametN)r   r   r   r   r   ry   rz   r   r   r   r   r   r   s         r0   Land_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructedzbDescribeTablePreChunk.and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed  s&   G$*===
 
 
 bMMM!%FFF	**,,
Z  %,,,,,z:::::~+w6666>1111Z  %,,,,,z;;;;;~+w6666~------r@   r   r9   r   r   )z/In rhoncus ipsum sed lectus     porta volutpat.porta volutpat.r   strr9   c                |    t          t          |          dt          dd                    }|j        |k    sJ d S Nr      Tra   r   )r   r   r   overlap_tailr/   r   r9   r   s       r0   ?it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlapzUDescribeTablePreChunk.it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap  sO     "$KKY]1^1^1^
 
 
	 %777777r@   )r   r   r9   )r   r   r   )r   r   z@ctus porta volutpat.
In rhoncus ipsum sed lectus porta volutpat.r   c                v    t          t          |          |t                                }|j        |k    sJ d S )Nr   )r   r   r   r   )r/   r   r   r9   r   s        r0   7it_includes_its_overlap_prefix_in_its_text_when_presentzMDescribeTablePreChunk.it_includes_its_overlap_prefix_in_its_text_when_present0  sG    $ "$KK_=N=N
 
 
	 +~======r@   c                    t          dt          d                    }t          |dt                                }|j        }|j        dk    sJ |j        |gk    sJ |j        |usJ d S )NLorem ipsumr   r   r   r   r   )r   r   r   r   	_metadatar   r   )r/   r   r   r   s       r0   +it_computes_metadata_for_each_chunk_to_helpzADescribeTablePreChunk.it_computes_metadata_for_each_chunk_to_helpG  s    mo:.V.V.VWWW!%ARARSSS	&$
2222%%0000 "(222222r@   c                    t          t          dt          d                    dt          d                    }|j        j        J d S )	Nr   r   r   r   r   Fr   r   )r   r   r   r   r   r   r/   r   s     r0   ;but_it_omits_orig_elements_from_metadata_when_so_instructedzQDescribeTablePreChunk.but_it_omits_orig_elements_from_metadata_when_so_instructedT  s\    !-/z*R*R*RSSS u===
 
 
	 "088888r@   c           	     4   t          dt          dt          d          g                    }t          |dt                                }|j        }t          |          dk    sJ |d	         }||k    sJ ||usJ |j        j        J |j        |u sJ d S )
Nr   r   rt   )r   r   r   r   r      r   )r   r   r   r   _orig_elementslenr   r   )r/   r   r   r   orig_elements        r0   .it_computes_the_original_elements_list_to_helpzDDescribeTablePreChunk.it_computes_the_original_elements_list_to_help]  s    $*US`MaMaLbccc
 
 
 "%ARARSSS	!0 =!!Q&&&&$Q' u$$$$5(((( $2:::'=888888r@   Nr   r   r9   r   )r   r   r   r   r9   r   )rk   rl   rm   rn   r   r   r   r   r   r   r+   ro   rp   r   r   r   r   r  rF   r@   r0   r   r     s0       QQ     D    4 4 4= = =~. . .. [" TR		
 8 8 8 8 [4
	
 > > > >3 3 39 9 99 9 9 9 9r@   r   c                     e Zd ZdZej                            ddddgdddgdfdddgdddgdfdddgdddgd	fdddgdd
dgd	fdddgdd
dgd	fg          d8d            Zd Zej                            dg d          d9d            Z	d Z
d Zd Zd Zd  Zej                            d!d"d#g          d:d%            Zd& Zd' Zd( Zd) Zd* Zej                            d+ ed           ed          gd,d-f ed           ed           ed          gd.d/f ed           ed          gd0d1f ed           ed           ed          gdd2fg          d;d6            Zd7S )<DescribeTextPreChunkzFUnit-test suite for `unstructured.chunking.base.TextPreChunk` objects.)overlap_pfxtextsother_overlap_pfxother_textsr9   foobarbazTr   fobFbahdahr  r   r  	list[str]r	  r
  r9   rD   c                    t                      }t          d |D             ||          }t          d |D             ||          }||k    |u sJ d S )Nc                ,    g | ]}t          |          S rF   r   .0ts     r0   
<listcomp>zcDescribeTextPreChunk.it_knows_when_it_is_equal_to_another_TextPreChunk_instance.<locals>.<listcomp>  s    !9!9!9a$q''!9!9!9r@   r   c                ,    g | ]}t          |          S rF   r  r  s     r0   r  zcDescribeTextPreChunk.it_knows_when_it_is_equal_to_another_TextPreChunk_instance.<locals>.<listcomp>  s    ***T!WW***r@   )r   r   )	r/   r  r  r	  r
  r9   r>   r   other_pre_chunks	            r0   :it_knows_when_it_is_equal_to_another_TextPreChunk_instancezODescribeTextPreChunk.it_knows_when_it_is_equal_to_another_TextPreChunk_instancew  s|    .    !9!95!9!9!9+\`aaa	&**k***;LSW
 
 
 _,??????r@   c                R    t          g dt                                }|dk    sJ d S )Nr   r   r:   )r   r   r   s     r0   Dand_it_knows_it_is_not_equal_to_an_object_that_is_not_a_TextPreChunkzYDescribeTextPreChunk.and_it_knows_it_is_not_equal_to_an_object_that_is_not_a_TextPreChunk  s-     B_=N=NOOO	Br@   )r"   r8   r9   ))r   I   T)r   H   F)c   r  Fr"   r%   r8   c                    t          ||dd          }t          t          d          gd|          }t          t          d          gd|          }|                    |          |u sJ d	S )
zSThis allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals.r   T)r"   r8   rP   rb   z2Lorem ipsum dolor sit amet consectetur adipiscing.e feugiat efficitur.r   zIn rhoncus sum sed lectus.zsectetur adipiscing.N)r   r   r   can_combine)r/   r"   r8   r9   r>   r   next_pre_chunks          r0   Fit_knows_when_it_can_combine_itself_with_another_TextPreChunk_instancez[DescribeTextPreChunk.it_knows_when_it_can_combine_itself_with_another_TextPreChunk_instance  s    " )'A	
 
 
 !FGGH1
 
 
	
 &.//01
 
 
 $$^44FFFFFFr@   c           	     t   t                      }t          t          d          t          d          gd|          }t          t          d          t          d          gd|          }|                    |          }|t          t          d          t          d          t          d          t          d          gd|          k    sJ |t          t          d          t          d          gd|          k    sJ |t          t          d          t          d          gd|          k    sJ dS )	z.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.

        Note that neither the original or other pre_chunk are mutated.
        r   r   zfeugiat efficitur.r   z/Donec semper facilisis metus finibus malesuada.zCVivamus magna nibh, blandit eu dui congue, feugiat efficitur velit.zporta volupat.N)r   r   r   combine)r/   r>   r   r  new_pre_chunks        r0   8it_can_combine_itself_with_another_TextPreChunk_instancezMDescribeTextPreChunk.it_can_combine_itself_with_another_TextPreChunk_instance  s   
    NOOBCC 0
 
 
	 'FGGZ[[ ,
 
 
 "))/:: NOOBCCFGGZ[[	 0	!
 	!
 	!
 	
 	
 	
 	
 LNOOBCC 0
 
 
 
 
 
 
 ,FGGZ[[ ,#
 #
 #
 
 
 
 
 
 
r@   c                   t          d          t          d          g}t          dd          }t          |d|          }|                                }t          |          }|t          d          k    sJ |j        |j        u sJ |j        j	        |k    sJ t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )	Nr   zcLorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.rN   Tr   r"  r   ze feugiat efficitur.

Introduction

Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.)r   r   r   r   r   ry   r   r   _consolidated_metadatar   r+   r,   r|   r/   r}   r>   r   r   r   s         r0   Lit_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_windowzaDescribeTextPreChunk.it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window  sH   .!!* 
 cNNN :PW[\\\	**,,
Z  (L
 
 
 
 
 
 ~!AAAAA~+x7777]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   5CCCc                .   t          d          g}t          dd          }t          |d|          }|                                }t	          |          }|t          d          k    sJ |j        |j        u sJ |j        j        |k    sJ t	          |          }|t          d          k    sJ |j        |j	        u sJ |j        j        |k    sJ t          j        t                    5  t	          |           d d d            d S # 1 swxY w Y   d S )	NzLorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.rN   Tr   r   r   zLorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi utz aliquip ex ea commodo consequat.)r   r   r   r   ry   r   r   r+  r   _continuation_metadatar+   r,   r|   r,  s         r0   Ibut_it_generates_split_chunks_when_its_single_element_exceeds_window_sizez^DescribeTextPreChunk.but_it_generates_split_chunks_when_its_single_element_exceeds_window_size  s    & 
 cNNN "4HHH	**,,
 Z  (I
 
 
 
 
 

 ~!AAAAA~+x7777Z  ()KLLLLLL~!AAAAA~+x7777]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   -D

DDc                    t          dddgd          }t          t          d|          gdt          d	
                    }|                                }d |D             g dk    sJ d S )Nr   foo.docxlatf87731e0category_depthfilename	languages	parent_id0'Lorem ipsum dolor' means 'Thank you very much'.r   r   r   r)   r   c                &    g | ]}|j         j        S rF   )r   r   )r  cs     r0   r  ztDescribeTextPreChunk.and_it_adds_the_is_continuation_flag_for_second_and_later_text_split_chunks.<locals>.<listcomp>F  s    ???q
*???r@   )NTT)r   r   r   r   r   r/   r   r   r   s       r0   Kand_it_adds_the_is_continuation_flag_for_second_and_later_text_split_chunksz`DescribeTextPreChunk.and_it_adds_the_is_continuation_flag_for_second_and_later_text_split_chunks5  s    "g 	
 
 
 !DxXXXY 333	
 
 
	 **,,
??J???CUCUCUUUUUUUr@   c                (   t                      }t          t          d|          gdt                                }|                                }t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )Nr   r   r   )	r   r   r   r   r   r+   r,   r|   ry   r=  s       r0   >but_it_generates_no_chunks_when_the_pre_chunk_contains_no_textzSDescribeTextPreChunk.but_it_generates_no_chunks_when_the_pre_chunk_contains_no_textH  s    "$$ rH---. ""
 
 
	 **,,
]=)) 	 		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   *BBBr   r   )z/In rhoncus ipsum sed lectus   porta volutpat.  r   r   c                ~    t          t          |          gdt          dd                    }|j        |k    sJ d S r   )r   r   r   r   r   s       r0   r   zTDescribeTextPreChunk.it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlapU  sQ     !$ZZL/"Z^2_2_2_
 
 
	 %777777r@   c                   t          t          dt          dddgd                    t          dt          d	dd
ddg                    gdt	                                }|j        dd	gddgdgddggd
gdgdk    sJ d S )Nrt   r   r2  r3  r4  r5  r   9'Lorem ipsum dolor' means 'Thank you very much' in Latin.r   
sprite.pngengr6  r7  
image_pathr8  r   r   )r6  r7  r8  rG  r9  )r   r   r   r   r   _all_metadata_valuesr   s     r0   Cit_extracts_all_populated_metadata_values_from_the_elements_to_helpzXDescribeTextPreChunk.it_extracts_all_populated_metadata_values_from_the_elements_to_helpf  s     !,'(!+#('",	     O,'(!+#/#(%.	    (  ""-
 
 
	2 - !f#Z0 'E5>2'.$2
 2
 
 
 
 
 
 
r@   c                ,   t          dddgd          }d|_        t          ddddd	g
          }d|_        t          t	          d|          t          d|          gdt                                }|j        ddgddgdgdgdd	ggdgdk    sJ d S )Nr   r2  r3  r4  r5  gףp=
?r   rD  rE  rF  gףp=
?rt   r   r:  r   r   )r6  r7  rG  r8  r9  )r   coefficientquotientr   r   r   r   rH  )r/   r   
metadata_2r   s       r0   ;but_it_discards_ad_hoc_metadata_fields_during_consolidationzPDescribeTextPreChunk.but_it_discards_ad_hoc_metadata_fields_during_consolidation  s    "g 	
 
 
  $$#en	
 
 

 #
 mh777GR\]]]  ""
 
 
	 - !f#Z0'. 'E5>2$2
 2
 
 
 
 
 
 
r@   c                   t          d          }t          d          }t          d|          }t          d|          }t	          ||gd|	          }|j        }|j        }|J |||gk    sJ |d
         |u sJ |d         |u sJ d S )NTr   zfoo.pdf)r7  rt   r   r:  r   r   r   r   )r   r   r   r   r   r+  r   )r/   r>   r   r   	element_2r   consolidated_metadatar   s           r0   Aand_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructedzVDescribeTextPreChunk.and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed  s    T:::"I666999KV^___	 '9!5btTTT	 ) @ .;((() 44444Q7****Q9,,,,,,r@   c                .   t          t          d          t          dt          ddddgddgd	g
                    t	          dt          dddgddgdd	g                    gdt                                }|j        }|dg dg dd	dgdk    sJ dS )z._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.

        Only non-None fields should appear in the dict and each field value should be the
        consolidation of the values across the pre_chunk elements.
        r   rt   r2  r   LoremIpsumbir3  )r7  r6  emphasized_text_contentsemphasized_text_tagsr8  r   rC  zbar.docxipsumrE  )r7  rX  rY  r8  r   )rT  rU  rT  rZ  )rV  rW  rW  rV  N)r   r   r   r   r   r   _meta_kwargs)r/   r   meta_kwargss      r0   Pit_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategieszeDescribeTextPreChunk.it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies  s    !"!,!+'(2971C.13Z#('  
 
 
 O,!+ 3:71C.13Z#(%.
 
 
  8  ""=
 
 
	B  ,"(L(L(L$8$8$8	
 
 
 
 
 
 
 
r@   c                t   t          d          }t          d          }t          dt          t          d          g                    }t	          |||gdt          d	          
          }|j        }||||gk    sJ |d         |u sJ |d         |usJ |d         j        j        J |j        |u sJ d S )Nr   r   r   zPorta volupat.)r   r   r   Tr   r   r      )	r   r   r   r   r   r   r   r   r   )r/   r   rP  	element_3r   r   s         r0   r  zCDescribeTextPreChunk.it_computes_the_original_elements_list_to_help  s   ''RSS	$9$D9I4J4J3KLLL
 
 
	 !i+ t<<<
 
 
	 "0 )Y ????? Q7****Qy0000Q(6>>>'=888888r@   )r}   r   r9   zbah da bing.zbah da bing.

foo

barzda bang.zda bang.

foo

barzbah da boom.zbah da boom.

foozfoo

barr}   
list[Text]r   c                \    t          ||t                                }|j        |k    sJ dS )z._text is the "joined" text of the pre-chunk elements.

        The text-segment contributed by each element is separated from the next by a blank line
        ("

"). An element that contributes no text does not give rise to a separator.
        r   N)r   r   r   )r/   r}   r   r9   r   s        r0   7it_knows_the_concatenated_text_of_the_pre_chunk_to_helpzLDescribeTextPreChunk.it_knows_the_concatenated_text_of_the_pre_chunk_to_help
  s7    " !.O`O`aaa	.000000r@   N)
r  r   r  r  r	  r   r
  r  r9   rD   )r"   r%   r8   r%   r9   rD   r  )r}   ra  r   r   r9   r   )rk   rl   rm   rn   r+   ro   rp   r  r  r%  r)  r-  r0  r>  r@  r   rI  rN  rR  r]  r  r   r   rc  rF   r@   r0   r  r  t  s       PP[V UENEE5>4@%eU^T:UENEE5>5AUENEE5>5A%>	
 @ @ @ @    [J		
 		
 		
 G G G G.8
 8
 8
t  0# # #JV V V&   [" TR		
 8 8 8 8%
 %
 %
N 
  
  
D- - -".
 .
 .
`9 9 94 [8d5kk44;;'9UVd5kk99R==$$u++6
D\]immTT%[[)>;PQd5kk44;;		"6LI		
 	1 	1 	1 	1 	1 	1r@   r  c                  $    e Zd ZdZd Zd Zd ZdS )Describe_TableSplitterz@Unit-test suite for `unstructured.chunking.base._TableSplitter`.c                    t          d          }t          j        d          }t          t	          j        ||                    g dk    sJ d S )Nru   r)   aP  
            <table border="1" class="dataframe">
              <tbody>
                <tr>
                  <td>Stanley
              Cups</td>
                  <td></td>
                  <td></td>
                </tr>
                <tr>
                  <td>Team</td>
                  <td>Location</td>
                  <td>Stanley Cups</td>
                </tr>
                <tr>
                  <td>Blues</td>
                  <td>STL</td>
                  <td>1</td>
                </tr>
                <tr>
                  <td>Flyers</td>
                  <td>PHI</td>
                  <td>2</td>
                </tr>
                <tr>
                  <td>Maple Leafs</td>
                  <td>TOR</td>
                  <td>13</td>
                </tr>
              </tbody>
            </table>
            ))z'Stanley Cups Team Location Stanley Cupszs<table><tr><td>Stanley Cups</td><td/><td/></tr><tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr></table>)zBlues STL 1 Flyers PHI 2zj<table><tr><td>Blues</td><td>STL</td><td>1</td></tr><tr><td>Flyers</td><td>PHI</td><td>2</td></tr></table>)zMaple Leafs TOR 13zC<table><tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr></table>r   r   from_html_textr   r   iter_subtablesr/   r>   r   s      r0   =it_splits_an_HTML_table_on_whole_row_boundaries_when_possiblezTDescribe_TableSplitter.it_splits_an_HTML_table_on_whole_row_boundaries_when_possible'  st    s444-!
 !

F N1*dCCDD I
 I
 I
 
 
 
 
 
 
r@   c                    t          d          }t          j        d          }t          t	          j        ||                    g dk    sJ d S )Nr   r)   a.  
            <html><body><table>
              <tr>
                <td>Lorem ipsum dolor sit amet.</td>
                <td>   Consectetur    adipiscing     elit.   </td>
                <td>
                  Laboris nisi ut
                  aliquip ex ea commodo.
                </td>
              </tr>
              <tr>
                <td>Duis</td>
                <td>Dolor</td>
              </tr>
              <tr>
                <td>Duis</td>
                <td>Cillum</td>
              </tr>
            </table></body></html>
            ))z8Lorem ipsum dolor sit amet. Consectetur adipiscing elit.za<table><tr><td>Lorem ipsum dolor sit amet.</td><td>Consectetur adipiscing elit.</td></tr></table>)z&Laboris nisi ut aliquip ex ea commodo.zG<table><tr><td>Laboris nisi ut aliquip ex ea commodo.</td></tr></table>)zDuis Dolor Duis CillumzX<table><tr><td>Duis</td><td>Dolor</td></tr><tr><td>Duis</td><td>Cillum</td></tr></table>rg  rj  s      r0   Eand_it_splits_an_oversized_row_on_an_even_cell_boundary_when_possiblez\Describe_TableSplitter.and_it_splits_an_oversized_row_on_an_even_cell_boundary_when_possiblea  ss    s444-
 

. N1*dCCDD I
 I
 I
 
 
 
 
 
 
r@   c                    t          d          }t          j        d          }t          t	          j        ||                    g dk    sJ d S )Nr   r)   a  
            <table>
              <thead>
                <tr>
                  <td>
                    Lorem ipsum dolor sit amet,
                    consectetur adipiscing elit.
                    Sed do eiusmod tempor
                    incididunt ut labore et dolore magna aliqua.
                  </td>
                  <td> Ut enim ad minim veniam.           </td>
                  <td> Quis nostrud exercitation ullamco. </td>
                </tr>
              </thead>
              <tbody>
                <tr><td>Duis aute irure dolor</td></tr>
                <tr><td>In reprehenderit voluptate.</td></tr>
              </tbody>
            </table
            ))z?Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed doz`<table><tr><td>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do</td></tr></table>)z;eiusmod tempor incididunt ut labore et dolore magna aliqua.z\<table><tr><td>eiusmod tempor incididunt ut labore et dolore magna aliqua.</td></tr></table>)z;Ut enim ad minim veniam. Quis nostrud exercitation ullamco.zd<table><tr><td>Ut enim ad minim veniam.</td><td>Quis nostrud exercitation ullamco.</td></tr></table>)z1Duis aute irure dolor In reprehenderit voluptate.zc<table><tr><td>Duis aute irure dolor</td></tr><tr><td>In reprehenderit voluptate.</td></tr></table>rg  rj  s      r0   8and_it_splits_an_oversized_cell_on_an_even_word_boundaryzODescribe_TableSplitter.and_it_splits_an_oversized_cell_on_an_even_word_boundary  ss    s444-
 

. N1*dCCDD I
 I
 I
 
 
 
 
 
 
r@   N)rk   rl   rm   rn   rk  rm  ro  rF   r@   r0   re  re  $  sI        JJ8
 8
 8
t,
 ,
 ,
\4
 4
 4
 4
 4
r@   re  c                      e Zd ZdZd Zd Zd Zej        	                    dddg          dd	            Z
d
 Zej        	                    dddg          dd            ZdS )Describe_TextSplitterzGUnit-test suite for `unstructured.chunking.base._TextSplitter` objects.c                    t          ddd          }t          |          }d} ||          \  }}|dk    sJ |dk    sJ  ||          \  }}|dk    sJ |dk    sJ d S )	Nr4   r   r]   r"   r   rP   zULorem ipsum dolor amet consectetur adipiscing.  
  In rhoncus ipsum sed lectus porta..Lorem ipsum dolor amet consectetur adipiscing.z,ipiscing. In rhoncus ipsum sed lectus porta.r   r   r   r/   r>   splitr   s	remainders         r0   .it_splits_on_a_preferred_separator_when_it_canzDDescribe_TextSplitter.it_splits_on_a_preferred_separator_when_it_can  s    bKacdddd##1 	
 uT{{9 DDDDD
 JJJJJuY''9BBBBBBr@   c                    t          ddd          }t          |          }d} ||          \  }}|dk    sJ |dk    sJ  ||          \  }}|dk    sJ |d	k    sJ  ||          \  }}|d	k    sJ |d
k    sJ d S )N(   r   r]   rs  zZLorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta volutpat.z"Lorem ipsum dolor amet consecteturzAnsectetur adipiscing. In rhoncus ipsum sed lectus porta volutpat.z&nsectetur adipiscing. In rhoncus ipsumz$cus ipsum sed lectus porta volutpat.r   ru  rv  s         r0   Mand_it_splits_on_the_next_available_separator_when_the_first_is_not_availablezcDescribe_TextSplitter.and_it_splits_on_the_next_available_separator_when_the_first_is_not_available  s    bKacdddd## 	
 uT{{988888_____uY''9<<<<<BBBBBuY''9:::::Br@   c                    t          ddd          }t          |          }d} ||          \  }}|dk    sJ |dk    sJ  ||          \  }}|dk    sJ |d	k    sJ  ||          \  }}|d
k    sJ |dk    sJ d S )Nr   r   r]   rs  PLoremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta.Loremipsumdolorametconsecteturz<onsecteturadipiscingelit. In rhoncus ipsum sed lectus porta.zonsecteturadipiscingelit. Inz)gelit. In rhoncus ipsum sed lectus porta.zgelit. In rhoncus ipsum sedzipsum sed lectus porta.ru  rv  s         r0   8and_it_splits_on_an_arbitrary_character_as_a_last_resortzNDescribe_TextSplitter.and_it_splits_on_an_arbitrary_character_as_a_last_resort  s    bKacdddd##auT{{944444ZZZZZuY''922222GGGGGuY''9111115555555r@   r   rt  zLorem ipsum dolor.r   c                    t          dd          }t          |          } ||          \  }}||k    sJ |dk    sJ d S )N.   r]   rO   r   ru  )r/   r   r>   rw  rx  ry  s         r0   9it_does_not_split_a_string_that_is_not_longer_than_maxlenzODescribe_TextSplitter.it_does_not_split_a_string_that_is_not_longer_than_maxlen  sR     b"===d##uT{{9DyyyyBr@   c                    t          dd          }t          |          }d} ||          \  }}|dk    sJ t          |          dk    sJ d S )N&   r]   rO   r  &Loremipsumdolorametconsecteturadipisci)r   r   r  )r/   r>   rw  r   rx  _s         r0   Eit_fills_the_window_when_falling_back_to_an_arbitrary_character_splitz[Describe_TextSplitter.it_fills_the_window_when_falling_back_to_an_arbitrary_character_split  s`    b"===d##auT{{1<<<<<1vv||||||r@   
separatorsr   )r   Sequence[str]c                    t          d|d          }t          |          }d} ||          \  }}|dk    sJ |dk    sJ d S )Nr4   r]   rs  zPLorem ipsum dolor amet consectetur adipiscing.   

 In rhoncus ipsum sed lectus.rt  z&ipiscing. In rhoncus ipsum sed lectus.ru  )r/   r  r>   rw  r   rx  ry  s          r0   %it_strips_whitespace_around_the_splitz;Describe_TextSplitter.it_strips_whitespace_around_the_split  sf    bJ`bcccd##c uT{{9DDDDDDDDDDDDr@   N)r   r   )r  r  )rk   rl   rm   rn   rz  r}  r  r+   ro   rp   r  r  r  rF   r@   r0   rq  rq    s        QQ  ,  (6 6 6" [< 	
       [\K+@AA	E 	E 	E BA	E 	E 	Er@   rq  c                      e Zd ZdZd Zd Zej                            dg d          dd
            Z	ej                            dg d          dd            Z
d Zd Zd ZdS )Describe_CellAccumulatorzBUnit-test suite for `unstructured.chunking.base._CellAccumulator`.c                @    t          d          }|j        g k    sJ d S Nr   maxlen)r   _cellsr/   accums     r0   r   z4Describe_CellAccumulator.it_is_empty_on_construction.  s+     ,,,|r!!!!!!r@   c                    t          d          }t          |          }t          d          }|                    |           |j        |gk    sJ d S )Nz<td>foobar</td>r   r  )r   r   r   add_cellr  )r/   tdcellr  s       r0   r   z<Describe_CellAccumulator.it_accumulates_elements_added_to_it3  sX     !233|| ,,,t|v%%%%%%r@   )	cell_htmlr9   )z<td/>Tz<td>Lorem Ipsum.</td>T)z<td>Lorem Ipsum dolor sit.</td>Tz$<td>Lorem Ipsum dolor sit amet.</td>Fr  r   r9   rD   c                    t          d          }t          t          |                    }|                    |          |u sJ dS )zCell text must be 22-chars or shorter to fit in 55-char window.

        `<table><tr><td>...</td></tr></table>` overhead is 33 characters.
        7   r  N)r   r   r   r   r/   r  r9   r  r  s        r0   Dit_will_fit_a_cell_with_text_shorter_than_maxlen_minus_33_when_emptyz]Describe_CellAccumulator.it_will_fit_a_cell_with_text_shorter_than_maxlen_minus_33_when_empty<  sM      !++++I6677~~d##~555555r@   )r  r  )z<td>Lorem Ipsum amet.</td>T)z<td>Lorem Ipsum dolor.</td>Fr  c                    t          d          }|                    t          t          d                               t          t          |                    }|                    |          |u sJ dS )zCell text must be 9-chars shorter than remaining space to fit with accumulated cells.

        `<td>...</td>` overhead is 9 characters.
        U   r  z#<td>abcdefghijklmnopqrstuvwxyz</td>N)r   r  r   r   r   r  s        r0   Tand_it_will_fit_a_cell_with_text_shorter_than_remaining_space_minus_9_when_not_emptyzmDescribe_CellAccumulator.and_it_will_fit_a_cell_with_text_shorter_than_remaining_space_minus_9_when_not_emptyQ  sq    " !+++x 34Y Z Z[[\\\+I6677~~d##~555555r@   c                   t          d          }|                    t          t          d                               t	          |                                          \  }}|dk    sJ |dk    sJ |j        g k    sJ d S )Nr   r  <td>abcde fghij klmno</td>abcde fghij klmno2<table><tr><td>abcde fghij klmno</td></tr></table>r   r  r   r   ry   r   r  r/   r  r   htmls       r0   Git_generates_a_TextAndHtml_pair_and_resets_itself_to_empty_when_flushedz`Describe_CellAccumulator.it_generates_a_TextAndHtml_pair_and_resets_itself_to_empty_when_flushedi  s     ,,,x 34P Q QRRSSS%++--((
d*****KKKKK|r!!!!!!r@   c                d   t          d          }|                    t          t          d                               |                    t          t          d                               t	          |                                          \  }}|dk    sJ |dk    sJ |j        g k    sJ d S )Nr   r  r  z<td>pqrst uvwxy z</td>abcde fghij klmno pqrst uvwxy zzH<table><tr><td>abcde fghij klmno</td><td>pqrst uvwxy z</td></tr></table>r  r  s       r0   7and_the_HTML_contains_as_many_cells_as_were_accumulatedzPDescribe_CellAccumulator.and_the_HTML_contains_as_many_cells_as_were_accumulateds  s     ,,,x 34P Q QRRSSSx 34L M MNNOOO%++--((
d88888aaaaa|r!!!!!!r@   c                    t          d          }t          j        t                    5  t	          |                                           d d d            d S # 1 swxY w Y   d S r  )r   r+   r,   r|   ry   r   r  s     r0   6but_it_does_not_generate_a_TextAndHtml_pair_when_emptyzODescribe_CellAccumulator.but_it_does_not_generate_a_TextAndHtml_pair_when_empty~  s     ,,,]=)) 	  	 	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	    "AA AN)r  r   r9   rD   )rk   rl   rm   rn   r   r   r+   ro   rp   r  r  r  r  r  rF   r@   r0   r  r  +  s        LL" " "
& & & ['	
 	
 	
 
6 
6 
6 
6 ['	
 	
 	
	 	6 6 6	 	6" " "	" 	" 	"         r@   r  c                      e Zd ZdZd Zd Zej                            dg d          dd
            Z	ej                            dg d          dd            Z
d Zd Zd ZdS )Describe_RowAccumulatorzAUnit-test suite for `unstructured.chunking.base._RowAccumulator`.c                @    t          d          }|j        g k    sJ d S r  )r   _rowsr  s     r0   r   z3Describe_RowAccumulator.it_is_empty_on_construction  s+    s+++{b      r@   c                    t          d          }t          t          d                    }|                    |           |j        |gk    sJ d S )Nr   r  z!<tr><td>foo</td><td>bar</td></tr>)r   r   r   add_rowr  )r/   r  rows      r0   it_accumulates_rows_added_to_itz7Describe_RowAccumulator.it_accumulates_rows_added_to_it  sV    s+++)*MNNOOc{se######r@   )row_htmlr9   )z<tr/>Tz<tr><td/></tr>T)z<tr><td>Lorem Ipsum.</td></tr>Tz(<tr><td>Lorem Ipsum dolor sit.</td></tr>Tz(<tr><td>Lorem</td><td>Sit amet</td></tr>T)z-<tr><td>Lorem Ipsum dolor sit amet.</td></tr>Fz0<tr><td>Lorem Ipsum</td><td>Dolor sit.</td></tr>Fr  r   r9   rD   c                    t          d          }t          t          |                    }|                    |          |u sJ dS )zRow HTML must be 40-chars or shorter to fit in 55-char chunking window.

        `<table>...</table>` overhead is 15 characters.
        r  r  N)r   r   r   r   r/   r  r9   r  r  s        r0   Cit_will_fit_a_row_with_HTML_shorter_than_maxlen_minus_15_when_emptyz[Describe_RowAccumulator.it_will_fit_a_row_with_HTML_shorter_than_maxlen_minus_15_when_empty  sM    &  r***)(3344~~c""n444444r@   )r  r  )z'<tr><td>Lorem Ipsum dolor sit</td></tr>Tr  r  )z)<tr><td>Lorem</td><td>Sit amet.</td></tr>Fr  c                    t          d          }|                    t          t          d                               t          t          |                    }|                    |          |u sJ dS )z9There is no overhead beyond row HTML for additional rows.r   r  z,<tr><td>abcdefghijklmnopqrstuvwxyz</td></tr>N)r   r  r   r   r   r  s        r0   Kand_it_will_fit_a_row_with_HTML_shorter_than_remaining_space_when_not_emptyzcDescribe_RowAccumulator.and_it_will_fit_a_row_with_HTML_shorter_than_remaining_space_when_not_empty  sq       r***g12`aabbccc)(3344~~c""n444444r@   c                   t          d          }|                    t          t          d                               t	          |                                          \  }}|dk    sJ |dk    sJ |j        g k    sJ d S )Nr   r  #<tr><td>abcde fghij klmno</td></tr>r  r  r   r  r   r   ry   r   r  r  s       r0   r  z_Describe_RowAccumulator.it_generates_a_TextAndHtml_pair_and_resets_itself_to_empty_when_flushed  s    s+++g12WXXYYZZZ%++--((
d*****KKKKK{b      r@   c                d   t          d          }|                    t          t          d                               |                    t          t          d                               t	          |                                          \  }}|dk    sJ |dk    sJ |j        g k    sJ d S )Nr   r  r  z<tr><td>pqrst uvwxy z</td></tr>r  zQ<table><tr><td>abcde fghij klmno</td></tr><tr><td>pqrst uvwxy z</td></tr></table>r  r  s       r0   6and_the_HTML_contains_as_many_rows_as_were_accumulatedzNDescribe_RowAccumulator.and_the_HTML_contains_as_many_rows_as_were_accumulated  s    s+++g12WXXYYZZZg12STTUUVVV%++--((
d88888
 
 
 
 {b      r@   c                    t          d          }t          j        t                    5  t	          |                                           d d d            d S # 1 swxY w Y   d S r  )r   r+   r,   r|   ry   r   r  s     r0   r  zNDescribe_RowAccumulator.but_it_does_not_generate_a_TextAndHtml_pair_when_empty  s    s+++]=)) 	  	 	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 r  N)r  r   r9   rD   )rk   rl   rm   rn   r   r  r+   ro   rp   r  r  r  r  r  rF   r@   r0   r  r    s        KK! ! !
$ $ $ [&	
 	
 	
 
5 
5 
5 
5 [&	
 	
 	
 	5 	5 	5 	5! ! !! ! !          r@   r  c                  0    e Zd ZdZd Zd Zd Zd Zd ZdS )DescribePreChunkCombinerzBUnit-test suite for `unstructured.chunking.base.PreChunkCombiner`.c           	        t          dd          }t          t          d          t          d          gd|          t          t          d          t          d          gd|          t          t          d	          t          d
          gd|          g}t	          ||                                          }t          |          }t          |t                    sJ |j        t          d          t          d          t          d          t          d          t          d	          t          d
          gk    sJ t          j
        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )N   r"   r8   rt   r   r   r   
Mauris Nec;Mauris nec urna non augue vulputate consequat eget et nisi.Sed Orci?Sed orci quam, eleifend sit amet vehicula, elementum ultricies.rw   r   r   r   r   r	   iter_combined_pre_chunksry   rz   r{   r+   r,   r|   r/   r>   r   r~   r   s        r0   ,it_combines_sequential_small_text_pre_chunkszEDescribePreChunkCombiner.it_combines_sequential_small_text_pre_chunks  s   ccRRR-((RSS  "   ,''VWW  "   *%%Z[[  "  #

6 **4@@@YY[[((	)\22222"-  JKK,NOO*RSS'
 
 
 
 
 ]=)) 	! 	!   	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!s   E;;E?E?c                   t          dd          }t          t          d          t          d          gd|          t	          t          d          d|          t          t          d          t          d	          gd|          g}t          |t          dd                                                    }t          |          }t          |t                    sJ |j
        t          d          t          d          gk    sJ t          |          }t          |t                    sJ |j        t          d          k    sJ t          |          }t          |t                    sJ |j
        t          d          t          d	          gk    sJ t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )
Nr  r  rt   r   r   r   r   r  r  )r   r   r   r   r   r   r	   r  ry   rz   r{   r   r+   r,   r|   r  s        r0   (but_it_does_not_combine_table_pre_chunkszADescribePreChunkCombiner.but_it_does_not_combine_table_pre_chunks  sQ   ccRRR-((RSS  "   % 455btTTT,''VWW  "  

( *sWZ[[[
 

"
"
$
$ 	 ((	)\22222"-  JKK'
 
 
 
 

 ((	)]333335)=#>#>>>>>((	)\22222",NOO'
 
 
 
 

 ]=)) 	! 	!   	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!s   4GGGc                v   t          dd          }t          t          d          t          d          gd|          t          t          d          t          d	          gd|          t          t          d
          t          d          gd|          g}t	          ||                                          }t          |          }t          |t                    sJ |j        t          d          t          d          t          d          t          d	          gk    sJ t          |          }t          |t                    sJ |j        t          d
          t          d          gk    sJ t          j
        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )Nr  P   r  rt   r   r   r   r  r  r  r  rw   r  r  s        r0   /it_respects_the_specified_combination_thresholdzHDescribePreChunkCombiner.it_respects_the_specified_combination_thresholdH  s0   cbQQQ-((RSS  "   ,''VWW  "   *%%Z[[  "  %

8 **4@@@YY[[((	)\22222"-  JKK,NOO	'
 
 
 
 
 ((	)\22222"*RSS'
 
 
 
 

 ]=)) 	! 	!   	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!   F..F25F2c                v   t          dd          }t          t          d          t          d          gd|          t          t          d          t          d          gd|          t          t          d	          t          d
          gd|          g}t	          ||                                          }t          |          }t          |t                    sJ |j        t          d          t          d          t          d          t          d          gk    sJ t          |          }t          |t                    sJ |j        t          d	          t          d
          gk    sJ t          j
        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )NrN   r  rt   r   r   r   r  r  r  r  rw   r  r  s        r0   *it_respects_the_hard_maximum_window_lengthzCDescribePreChunkCombiner.it_respects_the_hard_maximum_window_length{  s0   ccRRR-((RSS  "   ,''VWW  "   *%%Z[[  "  %

: **4@@@YY[[((	)\22222"-  JKK,NOO	'
 
 
 
 
 ((	)\22222"*RSS'
 
 
 
 

 ]=)) 	! 	!   	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!r  c                P   t          dd          }t          t          d          gd|          t          t          d          gd|          t          t          d          gd|          g}t	          |t          dd                                                    }t          |          }t          |t                    sJ |j        t          d          gk    sJ t          |          }t          |t                    sJ |j        t          d          gk    sJ t          |          }t          |t                    sJ |j        t          d          gk    sJ t          j
        t                    5  t          |           ddd           dS # 1 swxY w Y   dS )	z=Such as occurs when a single element exceeds the window size.ru   r  rt   r   r   zLorem ipsum dolor sit amet consectetur adipiscing elit. Mauris nec urna non augue vulputate consequat eget et nisi. Sed orci quam, eleifend sit amet vehicula, elementum ultricies.zVulputate ConsequatNr  r  s        r0   3it_accommodates_and_isolates_an_oversized_pre_chunkzLDescribePreChunkCombiner.it_accommodates_and_isolates_an_oversized_pre_chunk  s   ccRRR%../NNN[   "
 
 
 % 5667QUVVV

  *sWZ[[[
 

"
"
$
$ 	 ((	)\22222"u]';';&<<<<<((	)\22222"S '
 
 
 
 
 ((	)\22222"u-B'C'C&DDDDD]=)) 	! 	!   	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!s   >FF"FN)	rk   rl   rm   rn   r  r  r  r  r  rF   r@   r0   r  r    si        LL*! *! *!X-! -! -!^1! 1! 1!f2! 2! 2!h*! *! *! *! *!r@   r  c                      e Zd ZdZd Zd ZdS )DescribeTextPreChunkAccumulatorzIUnit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`.c           	        t          d          }t          |          }t          t          d          t	          d          gd|          }|                    |          sJ |                    |           t          t          d          t	          d	          gd
|          }|                    |          sJ |                    |           t          t          d          t	          d          gd|          }|                    |          sJ |                    |           |                                }t          |          }t          j
        t                    5  t          |           d d d            n# 1 swxY w Y   t          |t                    sJ |j        t          d          t	          d          t          d          t	          d	          t          d          t	          d          gk    sJ |j        dk    sJ t          j
        t                    5  t          |                                           d d d            d S # 1 swxY w Y   d S )Ni  r=   rw   rt   r   z
elementum.r   r  r  z	sit amet.r  zDSed orci quam, eleifend sit amet vehicula, elementum ultricies quam.z
consequat.)r   r   r   r   r   r   add_pre_chunkr   ry   r+   r,   r|   rz   r{   _overlap_prefix)r/   r>   r  r   r~   s        r0   Lit_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_emptyzlDescribeTextPreChunkAccumulator.it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty  s   #>>>'T222 m$$NOO (
 
 
	 ~~i(((((I&&& l##RSS '
 
 
	 ~~i(((((I&&& j!![\\ (
 
 
	 ~~i(((((I&&& ((	]=)) 	! 	!   	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! )\22222"-  JKK,NOO*WXX'
 
 
 
 
 (L8888]=)) 	  	 	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 s$   +FFF0"II#&I#c                    t          t          d                    }t          |                                          g k    sJ d S )Nru   r)   rw   )r   r   r   r   r  s     r0   ;but_it_does_not_generate_a_TextPreChunk_on_flush_when_emptyz[DescribeTextPreChunkAccumulator.but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty  sD    '_C-P-P-PQQQEKKMM""b((((((r@   N)rk   rl   rm   rn   r  r  rF   r@   r0   r  r    s9        SS9  9  9 v) ) ) ) )r@   r  c            
          e Zd ZdZej                            d ed           ed ed                    g          dd	            Z	d
 Z
d Zd Zd Zd ZdS )Describe_is_on_next_pagea"  Unit-test suite for `unstructured.chunking.base.is_on_next_page()` function.

    `is_on_next_page()` is not itself a predicate, rather it returns a predicate on Element
    (`Callable[[Element], bool]`) that can be called repeatedly to detect section changes in an
    element stream.
    r   r   efgh   page_numberr   r   c                <    t                      } ||          rJ dS )z-The first page never represents a page-break.N)r   )r/   r   preds      r0   1it_is_unconditionally_false_for_the_first_elementzJDescribe_is_on_next_page.it_is_unconditionally_false_for_the_first_element,  s)    
   4==     r@   c                    t                      } |t          dt          d                              rJ  |t          d                    rJ  |t          d                    rJ dS )zMAn element with a `None` page-number is assumed to continue the current page.r   r   r  r   r  ijklNr   r   r   r/   r  s     r0   2it_is_false_for_an_element_that_has_no_page_numberzKDescribe_is_on_next_page.it_is_false_for_an_element_that_has_no_page_number4  sy      4Vo!.L.L.LMMMNNNNN4V%%%%%4V%%%%%%%r@   c                2   t                      } |t          dt          d                              rJ  |t          d                    rJ  |t          dt          d                              rJ  |t          d                    rJ d S )Nr   r   r  r   r  r  mnopr  r  s     r0   7it_is_false_for_an_element_with_the_current_page_numberzPDescribe_is_on_next_page.it_is_false_for_an_element_with_the_current_page_number;  s      4Vo!.L.L.LMMMNNNNN4V%%%%%4Vo!.L.L.LMMMNNNNN4V%%%%%%%r@   c                    t                      } |t          d                    rJ  |t          dt          d                              rJ d S )Nr   r  r   r  r   r  r  s     r0   Cit_assigns_page_number_1_to_a_first_element_that_has_no_page_numberz\Describe_is_on_next_page.it_assigns_page_number_1_to_a_first_element_that_has_no_page_numberB  s]      4V%%%%%4Vo!.L.L.LMMMNNNNNNNr@   c                    t                      } |t          dt          d                              rJ  |t          dt          d                              sJ d S )Nr   r   r  r   r  r_  r  r  s     r0   @it_is_true_for_an_element_with_an_explicit_different_page_numberzYDescribe_is_on_next_page.it_is_true_for_an_element_with_an_explicit_different_page_numberG  so      4Vo!.L.L.LMMMNNNNNtD/a*H*H*HIIIJJJJJJJr@   c                   t                      } |t          dt          d                              rJ  |t          dt          d                              sJ  |t          dt          d                              rJ  |t          d                    rJ  |t          d	t          d
                              sJ d S )Nr   r  r  r   r  r_  r  r  qrst   r  r  s     r0   2and_it_is_true_even_when_that_page_number_is_lowerzKDescribe_is_on_next_page.and_it_is_true_even_when_that_page_number_is_lowerL  s      4Vo!.L.L.LMMMNNNNNtD/a*H*H*HIIIJJJJJ4Vo!.L.L.LMMMNNNNN4V%%%%%tD/a*H*H*HIIIJJJJJJJr@   Nr   )rk   rl   rm   rn   r+   ro   rp   r   r   r  r  r  r  r   r  rF   r@   r0   r  r  $  s          [DDLL$$vTU8V8V8V"W"W"WX ! ! ! !
& & && & &O O O
K K K
K K K K Kr@   r  c                      e Zd ZdZd Zej                            d ed           e	d           e
d          g          d
d            Zd	S )Describe_is_titlezFUnit-test suite for `unstructured.chunking.base.is_title()` predicate.c                B    t          t          d                    sJ d S )Nr   )r   r   r6   s    r0   it_is_true_for_a_Title_elementz0Describe_is_title.it_is_true_for_a_Title_elementX  s"    f&&&&&&&r@   r   r   r   r   r   c                (    t          |          rJ d S )N)r   )r/   r   s     r0   -and_it_is_false_for_any_other_element_subtypez?Describe_is_title.and_it_is_false_for_any_other_element_subtype[  s     G$$$$$$$r@   Nr   )rk   rl   rm   rn   r  r+   ro   rp   r   r   r   r
  rF   r@   r0   r  r  U  s        PP' ' ' [IbMMEKLLDLL	
 % % % % % %r@   r  )2rn   
__future__r   typingr   r   r+   	lxml.htmlr   unstructured.chunking.baser   r   r	   r
   r   r   r   r   r   r   r   r   r   unstructured.common.html_tabler   r   r   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   r!   rr   r   r   r  re  rq  r  r  r  r  r  r  rF   r@   r0   <module>r     s   C B " " " " " "                  ) ) ) ) ) )                              H G G G G G G G G G
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"p: p: p: p: p: p: p: p:p+! +! +! +! +! +! +! +!\e. e. e. e. e. e. e. e.Zq9 q9 q9 q9 q9 q9 q9 q9hh1 h1 h1 h1 h1 h1 h1 h1`_
 _
 _
 _
 _
 _
 _
 _
DbE bE bE bE bE bE bE bEJW  W  W  W  W  W  W  W t]  ]  ]  ]  ]  ]  ]  ] Jo! o! o! o! o! o! o! o!d@) @) @) @) @) @) @) @)P.K .K .K .K .K .K .K .Kb% % % % % % % % % %r@   