
    NgEF                       d Z ddlmZ ddlmZmZ ddlZddlmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZmZmZmZmZmZmZmZmZ dd
lmZ d Zd Zd Z d Z!d Z"d Z#d Z$d Z%d Z&d Z'd Z( G d d          Z) G d d          Z*dS )z8Test suite for the `unstructured.chunking.title` module.    )annotations)AnyOptionalN)FixtureRequestMockfunction_mock)CHUNK_MULTI_PAGE_DEFAULT)_ByTitleChunkingOptionschunk_by_title)CoordinateSystem)	CheckBoxCompositeElementCoordinatesMetadataElementElementMetadataListItemTableTextTitle)partition_htmlc                     t          d          t          d          g} t          | d          }|t          d          t          d          t          d          gk    sJ d S )NIntroductionzcLorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.2   max_charactersz1Lorem ipsum dolor sit amet consectetur adipiscingz1elit. In rhoncus ipsum sed lectus porta volutpat.)r   r   r   r   elementschunkss     a/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/chunking/test_title.py3test_it_splits_a_large_element_into_multiple_chunksr    $   s    n	
 	
H HR888F((LMMLMM          c                 D   t          d          t          d          t          d          t          d          t          d          t          d          t          d          t          d          t          d	          t          d
          t                      g} t	          | dd          }t          |          dk    sJ |d         }t          |t                    sJ |j        j	        t          d          t          d          t          d          gk    sJ |d         }t          |t                    sJ |j        j	        t          d          gk    sJ |d         }t          |t                    sJ |j        j	        t          d          t          d          t          d          gk    sJ |d         }t          |t                    sJ |j        j	        t          d          t          d	          t          d
          t                      gk    sJ d S )NA Great DayToday is a great day.It is sunny outside.Heading
Cell textAn Okay DayToday is an okay day.It is rainy outside.	A Bad DayToday is a bad day.It is storming outside.r   Tcombine_text_under_n_charsinclude_orig_elements            )
r   r   r   r   r   len
isinstancer   metadataorig_elements)r   r   chunks      r   *test_it_splits_elements_by_title_and_tabler9   6   s5   m$%%#$$"##m$%%#$$k"##&''

H HZ^___Fv;;!1IEe-.....>'m$%%#$$,     1IEeU#####>'E2F,G,G+HHHHH1IEe-.....>'m$%%#$$,     1IEe-.....>'k"##&''

	,      r!   c                 j   t          dt          dg                    t          dt          dg                    t          d          t          d          t          d	          t          d
          t          d          t          d          t          d          t          d          t	                      g} t          | dd          }|t          d          t          d          t          d          t          d          gk    sJ |d         j        t          ddg          k    sJ d S )Nr#   Day)emphasized_text_contentsr6   r$   dayr%   r&   r'   r(   r)   r*   r+   r,   r   Fr-   8A Great Day

Today is a great day.

It is sunny outside.8An Okay Day

Today is an okay day.

It is rainy outside.7A Bad Day

Today is a bad day.

It is storming outside.)r   r   r   r   r   r   r   r6   r   s     r   test_chunk_by_titlerB   g   sI   moPUw&W&W&WXXX$Y^X_/`/`/`aaa#$$"##m$%%#$$k"##&''

H HZ_```FJ	
 	
 	"##WXXI	
 	
	 	 	 	 	 !95RW.!Y!Y!YYYYYYYr!   c                 \   t          dt          d                    t          dt          d                    t          dt          d                    t          d          t          d	          t          d
          t          d          t          d          t          d          t          d          t	                      g} t          | dd          }|t          d          t          d          t          d          t          d          t          d          gk    sJ d S )Nr#   r1   page_numberr=   r$   r2   r%   r&   r'   r(   r)   r*   r+   r,   Fr   multipage_sectionsr.   z+Today is a great day.

It is sunny outside.r@   rA   r   r   r   r   r   r   r   r   s     r   ,test_chunk_by_title_separates_by_page_numberrI      s>   mo!&D&D&DEEE$1/M/M/MNNN#o!.L.L.LMMM"##m$%%#$$k"##&''

H H[\]]]F	
 	
 	;	
 	
 	"##WXXI	
 	
      r!   c                 @   t          dt          d                    t          dt          d                    t          dt          d                    t          d          t          d	          t          d
          t          d          t          d          t          d          t          d          t	                      g} t          | dd          }|t          d          t          d          t          d          t          d          gk    sJ d S Nr#   r1   rD   r=   r$   r2   r%   r&   r'   r(   r)   r*   r+   r,   Tr   rF   r?   r@   rA   rH   r   s     r   &test_chuck_by_title_respects_multipagerL      s,   mo!&D&D&DEEE$1/M/M/MNNN#o!.L.L.LMMM"##m$%%#$$k"##&''

H HZ[\\\FJ	
 	
 	"##WXXI	
 	
	 	 	 	 	 	 	r!   c                 @   t          dt          d                    t          dt          d                    t          dt          d                    t          d          t          d	          t          d
          t          d          t          d          t          d          t          d          t	                      g} t          | dd          }|t          d          t          d          t          d          t          d          gk    sJ d S rK   rH   r   s     r   'test_chunk_by_title_groups_across_pagesrN      s,   mo!&D&D&DEEE$1/M/M/MNNN#o!.L.L.LMMM"##m$%%#$$k"##&''

H HZ[\\\FJ	
 	
 	"##WXXI	
 	
	 	 	 	 	 	 	r!   c                     d} t          | d          }t          |           }t          |          }||k    sJ ||k    sJ d S )N example-docs/example-10k-1p.htmlby_title)chunking_strategy)r   r   )filenamechunk_elementsr   r   s       r   ,test_add_chunking_strategy_on_partition_htmlrU      s[    1H#H
KKKNh''HH%%FX%%%%V######r!   c                 n   d} t          | dddd          }t          |           }t          |ddd          }|D ]3}t          |t                    sJ t	          |j                  dk    sJ 4|D ]3}t          |t                    sJ t	          |j                  dk    sJ 4||k    sJ ||k    sJ d S )NrP   rQ   r   r   d   )rR   r.   new_after_n_charsr   )r.   rX   r   )r   r   r5   r   r4   text)rS   rT   r   r   r8   chunk_elements         r   2test_add_chunking_strategy_respects_max_charactersr[      s   1H#$#$  N h''H#$	  F  & &%&&&&&5:#%%%%%' . .-.....=%&&#-----X%%%%V######r!   c            
     ,   t          dt          d                    t          dt          d                    t          dt          d                    t          d	t          d
                    t          dt          d                    g} t          | d          }t	          |d                   t	          t          d                    k    sJ t	          |d                   t	          t          d                    k    sJ d S )Nr#         ?)detection_class_probr=   r$   gףp=
?r%   g\(\?r'   gzG?r(   gffffff?r   r.   r?   r1   "An Okay Day

Today is an okay day.)r   r   r   r   strr   r   s     r   .test_chunk_by_title_drops_detection_class_probrb      sf   $%(  	
 	
 	
 	#$%)  	
 	
 	
 	"$%)  	
 	
 	
 	$%)  	
 	
 	
 	#$%)  	
 	
 	
3H@ HCCCFvay>>SWXX      vay>>S!12X!Y!YZZZZZZZZr!   c                 X   t          dt          t          dt          dd                                        t	          dt          t          d	t          d
d
                                        t	          dt          t          dt          dd                                        t          dt          t          dt          dd                                        t	          dt          t          dt          dd                                        g} t          | d          }t          |d                   t          t          d                    k    sJ t          |d                   t          t          d                    k    sJ d S )Nr#   ))皙?rd   )皙?rd   )rd   re   re   re   rd   )widthheight)pointssystem)coordinatesr=   r$   )rf   )333333?re   )re   rl   rl   rl   re   r%   )rm   )皙?rl   )rl   rn   rn   rn   rl   r'   r(   )ro   )r]   rn   )rn   r]   )r]   r]   rn   r   r_   r?   r1   r`   )r   r   r   r   r   r   ra   r   r   s     r   (test_chunk_by_title_drops_extra_metadatarp   '  s&   $/ ,#cBBB  
 
 
	
 	
 	
 	#$/ ,#cBBB  
 
 
	
 	
 	
 	"$/ ,#cBBB  
 
 
	
 	
 	
 	$/ ,#cBBB  
 
 
	
 	
 	
 	#$/ ,#cBBB  
 
 
	
 	
 	
sGHR HCCCFvay>>SWXX      vay>>S!12X!Y!YZZZZZZZZr!   c                     t          d          t          d          t          d          t          d          g} t          | d          }|t          d          t          d          gk    sJ dS )	zHPreChunker includes length of separators when computing remaining space.zChunking Prioritiesz"Divide text into manageable chunkszPreserve semantic boundariesz!Minimize mid-text chunk-splittings   r   zUChunking Priorities

Divide text into manageable chunks

Preserve semantic boundariesN)r   r   r   r   r   s     r   4test_it_considers_separator_length_when_pre_chunkingrs   z  s     	#$$566/00455	H HS999F/	
 	

 	<==      r!   c                      e Zd ZdZej                            dddidfddidfddidfi dfg          dd            Z ej                    dd            Z	dS )Describe_chunk_by_titlezLUnit-test suite for `unstructured.chunking.title.chunk_by_title()` function.)kwargsexpected_valuer/   TFNrv   dict[str, Any]rw   bool_chunk_by_title_r   c                T    t          g fi | |j        j        \  }}|j        |u sJ d S )N)r   	call_argsargsr/   )selfrv   rw   rz   _optss         r   ,it_supports_the_include_orig_elements_optionzDDescribe_chunk_by_title.it_supports_the_include_orig_elements_option  sC     	r$$V$$$",14)^;;;;;;r!   requestr   c                "    t          |d          S )Nz+unstructured.chunking.title._chunk_by_title)r   )r~   r   s     r   rz   z(Describe_chunk_by_title._chunk_by_title_  s    W&STTTr!   )rv   rx   rw   ry   rz   r   )r   r   )
__name__
__module____qualname____doc__pytestmarkparametrizer   fixturerz    r!   r   ru   ru     s        VV[$%t,d3%u-u5%t,d3J		
 < < < < V^U U U U U Ur!   ru   c                     e Zd ZdZej                            dddg          dd            Zd Zd Z	ej                            d	d
dg          dd            Z
d Zej                            ddddefg          dd            ZdS )Describe_ByTitleChunkingOptionszRUnit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects.n_charsiintc                    t          j        t          d|           5  t          j        |           d d d            d S # 1 swxY w Y   d S )Nz8'combine_text_under_n_chars' argument must be >= 0, got matchr_   r   raises
ValueErrorr
   new)r~   r   s     r   :it_rejects_combine_text_under_n_chars_for_n_less_than_zerozZDescribe_ByTitleChunkingOptions.it_rejects_combine_text_under_n_chars_for_n_less_than_zero  s    ]VWVV
 
 
 	L 	L $'7KKKK		L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	Ls   AA	Ac                @    t          d          }|j        dk    sJ dS )zSSpecifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining.r   r_   N)r
   r.   r~   r   s     r   Fit_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combiningzfDescribe_ByTitleChunkingOptions.it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining  s,    &!DDD.!333333r!   c                    	 t          d          }n$# t          $ r t          j        d           Y nw xY w|j        dk    sJ dS )zUCaller can specify `combine_text_under_n_chars` arg without specifying other options.r   r_   z?did not accept `combine_text_under_n_chars` as option by itselfN)r
   r   r   failr.   r   s     r   Iit_does_not_complain_when_specifying_combine_text_under_n_chars_by_itselfziDescribe_ByTitleChunkingOptions.it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself  sl    	[*bIIIDD 	[ 	[ 	[KYZZZZZ	[ ."444444s    44)r.   r   expected_hard_max)X  Ni  )r     r   r.   r   Optional[int]r   c                    t          j        t          d| d|           5  t          j        ||           ddd           dS # 1 swxY w Y   dS )aw  `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users.

        The behavior is no different from `combine_text_under_n_chars == max_characters`, but if
        `max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a
        larger number like 1500 then it can look like chunk-combining isn't working.
        zR'combine_text_under_n_chars' argument must not exceed `max_characters` value, got z > r   )r   r.   Nr   )r~   r.   r   r   s       r   ;it_rejects_combine_text_under_n_chars_greater_than_maxcharsz[Describe_ByTitleChunkingOptions.it_rejects_combine_text_under_n_chars_greater_than_maxchars  s     ]K2K K7HK K
 
 
 		 		 $'-Jd   		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 		s   AA
A
c                    	 t          j        d          }n$# t          $ r t          j        d           Y nw xY w|j        dk    sJ dS )zPCaller can specify `new_after_n_chars` arg without specifying any other options.   )rX   z6did not accept `new_after_n_chars` as option by itselfN)r
   r   r   r   r   soft_maxr   s     r   @it_does_not_complain_when_specifying_new_after_n_chars_by_itselfz`Describe_ByTitleChunkingOptions.it_does_not_complain_when_specifying_new_after_n_chars_by_itself  sn    	R*.EEEDD 	R 	R 	RKPQQQQQ	R }######s    99)rG   rw   )TT)FFNrG   ry   rw   c                <    t          |          }|j        |u sJ d S )N)rG   )r
   rG   )r~   rG   rw   r   s       r   3it_knows_whether_to_break_chunks_on_page_boundarieszSDescribe_ByTitleChunkingOptions.it_knows_whether_to_break_chunks_on_page_boundaries  s/     ':LMMM&.888888r!   )r   r   )r.   r   r   r   r   r   )rG   ry   rw   ry   )r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r   r!   r   r   r     s       \\[YS	22L L L 32L4 4 4
5 5 5 [M	?+   	 ($ $ $ [0	~.F'GH 9 9 9	 9 9 9r!   r   )+r   
__future__r   typingr   r   r   test_unstructured.unit_utilsr   r   r   unstructured.chunking.baser	   unstructured.chunking.titler
   r   "unstructured.documents.coordinatesr   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   unstructured.partition.htmlr   r    r9   rB   rI   rL   rN   rU   r[   rb   rp   rs   ru   r   r   r!   r   <module>r      s<   ? > " " " " " "                  L L L L L L L L L L ? ? ? ? ? ? O O O O O O O O ? ? ? ? ? ?
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 7 6 6 6 6 6  $. . .bZ Z Z<  >  6  8$ $ $$ $ $6%[ %[ %[PP[ P[ P[f  :U U U U U U U U:B9 B9 B9 B9 B9 B9 B9 B9 B9 B9r!   