
    Ng              	         d dl mZ d dlZd dlZd dlmZmZmZmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZ d dlmZ d d	lmZ d d
lm Z m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-  e%ej.                  e
	 d2ddddddd3d                        Z/	 	 d4d5d#Z0d6d'Z1d7d)Z2	 d8d9d,Z3	 d:d9d.Z4d;d1Z5dS )<    )annotationsN)IOAnyCallableLiteral)add_chunking_strategy)auto_paragraph_grouperclean_bullets)CoordinateSystem)
AddressElementElementMetadataEmailAddressFooterHeaderListItemNarrativeTextTextTitle)read_txt_file)FileType)PARAGRAPH_PATTERNUNICODE_BULLETS_RE)exactly_one)apply_metadataget_last_modified_date)is_bulleted_textis_email_addressis_possible_narrative_textis_possible_numbered_listis_possible_titleis_us_city_state_ziptext)fileencodingr#   paragraph_grouperdetection_originfilename
str | Noner$   IO[bytes] | Noner%   r&   ,Callable[[str], str] | Literal[False] | Noner'   kwargsr   returnlist[Element]c               p   ||                                 dk    r|s| sg S t          | ||           d}| t          | |          \  }}n(|t          ||          \  }}n|t          |          }|du rn| ||          }nt	          |          }t          |          }g }	t          | rt          |           nd          }
||
_        |D ]d}|                                 }|rLt          |          s=t          |          }t          j        |
          |_        |	                    |           e|	S )a  Partition a .txt documents into its constituent paragraph elements.

    If paragraphs are below "min_partition" or above "max_partition" boundaries,
    they are combined or split.
    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    encoding
        The encoding method used to decode the input bytes when drawn from `filename` or `file`.
        Defaults to "utf-8".
    text
        The string representation of the .txt document.
    paragrapher_grouper
        A str -> str function for fixing paragraphs that are interrupted by line breaks
        for formatting purposes.
    N )r(   r$   r#   )r(   r%   )r$   r%   F)last_modified)stripr   r   strr	   _split_by_paragraphr   r   r'   _is_empty_bulletelement_from_textcopydeepcopymetadataappend)r(   r$   r%   r#   r&   r'   r,   	file_textfile_contentelementsr9   ctextelements                W/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/text.pypartition_textrA   (   sz   > DJJLLB..t.H.	 48888I+XQQQ))		+III))		II	E!!		&%%i00		*955	&y11L H:BL,X666  H !1H % % 	%)%00 	%'..G#}X66GOOG$$$O    r3   coordinates&tuple[tuple[float, float], ...] | Nonecoordinate_systemCoordinateSystem | Noner   c                T   t          ||          rt          | ||          S t          ||          rt          | ||          S t	          |           r!t          |           }t          |||          S t          |           rt          |           S t          |           rt          | ||          S t          |           rt          | ||          S t          |           rt          | ||          S t          |           rt          | ||          S t!          | ||          S )N)r#   rC   rE   r#   )_is_in_header_positionr   _is_in_footer_positionr   r   r
   r   r   r   r"   r   r    r   r   r!   r   r   )r#   rC   rE   
clean_texts       r@   r6   r6   o   s   
 k+<== 2
#/
 
 
 	

 
 ->	?	? ,
#/
 
 
 	

 
$		 &
"4((
#/
 
 
 	

 
$		 
&&&&	d	#	# 
#/
 
 
 	

 
#4	(	( 
#/
 
 
 	

 
$D	)	) 
#/
 
 
 	

 
4	 	  
#/
 
 
 	
 #/
 
 
 	
rB   tuple[tuple[float, float], ...]r   floatc                h    t          d | D                       t          |           z  }||j        z  S )Nc              3  &   K   | ]}|d          V  dS )   N ).0
coordinates     r@   	<genexpr>z)_get_height_percentage.<locals>.<genexpr>   s&      <<*
1<<<<<<rB   )sumlenheight)rC   rE   avg_ys      r@   _get_height_percentagerY      s<     <<<<<<<s;?O?OOE$+++rB   boolc                j    t          t          j        |           ot          |           dk              S )z(Checks if input text is an empty bullet.rP   )rZ   r   matchrV   rH   s    r@   r5   r5      s+    "(..A3t99>BBBrB   (\?	thresholdc                :    | |dS t          | |          }||k    S )zZChecks to see if the position of the text indicates that the text belongs
    to a footer.NFrY   rC   rE   r^   height_percentages       r@   rJ   rJ      s1     /7u.{<MNNy((rB   Q?c                :    | |dS t          | |          }||k     S )zVChecks to see if the position of the text indicates that the text belongs to a header.NFr`   ra   s       r@   rI   rI      s1     /7u.{<MNNy((rB   r;   	list[str]c                Z    t          j        t          |                                           S )zSplit text into paragraphs.)resplitr   r2   )r;   s    r@   r4   r4      s    8%y'8'8999rB   )N)r(   r)   r$   r*   r%   r)   r#   r)   r&   r+   r'   r)   r,   r   r-   r.   )NN)r#   r3   rC   rD   rE   rF   r-   r   )rC   rL   rE   r   r-   rM   )r#   r3   r-   rZ   )r]   )rC   rD   rE   rF   r^   rM   r-   rZ   )rc   )r;   r3   r-   re   )6
__future__r   r7   rg   typingr   r   r   r   unstructured.chunkingr   unstructured.cleaners.corer	   r
   "unstructured.documents.coordinatesr   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   r    unstructured.file_utils.encodingr   unstructured.file_utils.modelr   unstructured.nlp.patternsr   r   $unstructured.partition.common.commonr   &unstructured.partition.common.metadatar   r    unstructured.partition.text_typer   r   r   r    r!   r"   TXTrA   r6   rY   r5   rJ   rI   r4   rQ   rB   r@   <module>rv      s   " " " " " "  				 - - - - - - - - - - - - 7 7 7 7 7 7        @ ? ? ? ? ?                        ; : : : : : 2 2 2 2 2 2 K K K K K K K K < < < < < < Y Y Y Y Y Y Y Y                B "FJ#)B B B B B  BN ;?157
 7
 7
 7
 7
~, , , ,C C C C ) ) ) ) )" 
) 
) 
) 
) 
): : : : : :rB   