
    Ng69                       d dl mZ d dlZd dlZd dlZd dlZd dlmZmZ d dl	Z
d dlmZ d dlmZmZmZmZmZmZmZ d=dZd>d
Zd=dZd=dZd?dZeefd@dZefdAdZefdBdZeddfdCdZd>dZe                     d  e!ej"                  D                       Z#dDd!Z$dEd$Z%d>d%Z&d>d&Z'd>d'Z(dFdGd*Z)dHdId1Z*dHdId2Z+	 	 	 	 	 dJdKd8Z,dFdLd9Z-dMd;Z.dNd<Z/dS )O    )annotationsN)OptionalTuple)format_encoding_str)DOUBLE_PARAGRAPH_PATTERN_REE_BULLET_PATTERNLINE_BREAK_REPARAGRAPH_PATTERNPARAGRAPH_PATTERN_REUNICODE_BULLETS_REUNICODE_BULLETS_RE_0Wreturnstrc                V    |                      dd          }|                                S )u   Cleans non-ascii characters from unicode string.

    Example
    -------
    This text contains non-ascii characters!
        -> This text contains non-ascii characters!
    asciiignore)encodedecode)textens     V/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/cleaners/core.pyclean_non_ascii_charsr      s%     
Wh	'	'B99;;    r   c                    t          j        |           }|| S t          j        d| d          }|                                S )u   Cleans unicode bullets from a section of text.

    Example
    -------
    ●  This is an excellent point! -> This is an excellent point!
    N    )r   matchsubstrip)r   searchcleaned_texts      r   clean_bulletsr"   %   sD      %d++F~%)"dA66Lr   c                2   |                                  }d                    |dd                   }t          d|d         vd|d         v g          r| S t          j         d|d                   }|d	         s|d	= t	          |d                   d
k    r| S |S )u/  Cleans the start of bulleted text sections up to three “sub-section”
    bullets accounting numeric and alphanumeric types.

    Example
    -------
    1.1 This is a very important point -> This is a very important point
    a.b This is a very important point -> This is a very important point
     r   N.r   z..z[\.])patternstring   )splitjoinanyrelen)r   text_sptext_clbullets       r   clean_ordered_bulletsr2   4   s     jjllGhhwqrr{##G
Cwqz!471:#5677 Xggaj999F": 2J
6!9~~Nr   c                    ddddddddd	d
ddddd}| }|                                 D ]\  }}|                    ||          }|S )u   Replaces ligatures with their most likely equivalent characters.

    Example
    -------
    The beneﬁts -> The benefits
    High quality ﬁnancial -> High quality financial
    aeAEfffiflffifflftlsoeOEqpstts)   æ   Æu   ﬀu   ﬁu   ﬂu   ﬃu   ﬄu   ﬅu   ʪu   œu   Œu   ȹu   ﬆu   ʦ)itemsreplace)r   ligatures_mapr!   kvs        r   clean_ligaturesrI   L   s}      M  L##%% 2 21#++Aq11r   	paragraphlistc                   g }t          j        t          d|                                           } t          j        t
          |           }|D ]2}|r.|                    t          j        t          d|                     3|S )uk  Groups paragraphs with bullets that have line breaks for visual/formatting purposes.
    For example:

    '''○ The big red fox
    is walking down the lane.

    ○ At the end of the lane
    the fox met a friendly bear.'''

    Gets converted to

    '''○ The big red fox is walking down the lane.
    ○ At the end of the land the fox met a bear.'''
       ·r$   )r-   r   r   r   r*   r   appendr
   )rJ   clean_paragraphsbullet_parasr1   s       r   group_bullet_paragraphrQ   k   s      ($	::AACCI819==L L L 	L##BF+<c6$J$JKKKr   
line_splitre.Pattern[str]paragraph_splitc                l   |                     |           }g }|D ]}|                                s|                     |          }t          d |D                       }t          j        |                                          s&t          j        |                                          r#|                    t          |                     |r |                    d |D                        |                    t          j
        t          d|                     d                    |          S )aE  Groups paragraphs that have line breaks for visual/formatting purposes.
    For example:

    '''The big red fox
    is walking down the lane.

    At the end of the lane
    the fox met a bear.'''

    Gets converted to

    '''The big red fox is walking down the lane.
    At the end of the land the fox met a bear.'''
    c              3     K   | ]<}t          |                                                    d                     dk     V  =dS )r$      N)r.   r   r*   .0lines     r   	<genexpr>z*group_broken_paragraphs.<locals>.<genexpr>   sE      VV4c$**,,"4"4S"9"9::Q>VVVVVVr   c                :    g | ]}|                                 |S  )r   rX   s     r   
<listcomp>z+group_broken_paragraphs.<locals>.<listcomp>   s%    $Q$Q$QdDJJLL$QT$Q$Q$Qr   r$   

)r*   r   allr   r   r   extendrQ   rN   r-   r   r
   r+   )r   rR   rT   
paragraphsrO   rJ   
para_splitall_lines_shorts           r   group_broken_paragraphsre      s;   & !&&t,,J O O	   	  %%i00
VV:VVVVV#IOO$5$566 	O:J:PQZQ`Q`QbQb:c:c 	O##$:9$E$EFFFF 	O##$Q$Qj$Q$Q$QRRRR##BF+<c9$M$MNNNN;;'(((r   c                    |                     |           }g }|D ],}|                                s|                    |           -d                    |          S )a,  
    Concatenates text document that has one-line paragraph break pattern

    For example,

    Iwan Roberts
    Roberts celebrating after scoring a goal for Norwich City
    in 2004

    Will be returned as:

    Iwan Roberts

Roberts celebrating after scoring a goal for Norwich City

in 2004
    r_   )r*   r   rN   r+   )r   rT   rb   rO   rJ   s        r   new_line_grouperrg      sk    " !&&t,,J + +	   		****;;'(((r   
re.Patternc                     t          |           S )a  
    Concatenates text document that has blank-line paragraph break pattern

    For example,

    Vestibulum auctor dapibus neque.

    Nunc dignissim risus id metus.

    Will be returned as:

    Vestibulum auctor dapibus neque.

Nunc dignissim risus id metus.



    )re   )r   rT   s     r   blank_line_grouperrj      s    $ #4(((r   i  g?max_line_countint	thresholdfloatc                   |                     |           }t          t          |          |          }d\  }}|d|         D ] }|dz  }|                                s|dz  }!||z  }||k     rt	          |           S t          |           S )a  
    Checks the ratio of new line (
) over the total max_line_count

    If the ratio of new line is less than the threshold,
    the document is considered a new-line grouping type
    and return the original text

    If the ratio of new line is greater than or equal to the threshold,
    the document is considered a blank-line grouping type
    and passed on to blank_line_grouper function
    )r   r   Nr   )r*   minr.   r   rg   rj   )	r   rR   rk   rm   lines
line_countempty_line_countrZ   ratios	            r   auto_paragraph_grouperru      s    " T""EU^44N#' J o~o& " "a
zz|| 	"!z)E y%%%!$'''r   c                   |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      d	d
          } |                      dd
          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      d d          } |                      d!d          } | S )"u   Replaces unicode bullets in text with the expected character

    Example
    -------
    What a lovely quote! -> “What a lovely quote!”
       u   ‘   u   ’   u   “   u   ”z&apos;'u   âu   â“u   —u   â”u   –u   â˜u   â¦u   …u   â™u   âœu   â?u   âťu   âśu   â¨u   âłu   ″u   âŽr   u   â‚u   â‰u   â‹u   âu   âs')rE   r   s    r   replace_unicode_quotesr}     s    <<&&D<<&&D<<&&D<<&&D<<#&&D<<c**D<<U++D<<U++D<<
E**D<<
E**D<<U++D<<
E**D<<	5))D<<
E**D<<
E**D<<
E**D<<
E**D<<
B''D<<R((D<<R((D<<R((D<<"%%D<<
B''DKr   c              #     K   | ]:}t          j        t          |                                        d           6|V  ;dS )PN)unicodedatacategorychr
startswith)rY   is     r   r[   r[   (  s[        
(<SVV(D(D(O(OPS(T(T     r   sc                :    |                      t                    } | S )z(Removes punctuation from a given string.)	translatetbl)r   s    r   remove_punctuationr   -  s    	CAHr   exclude_punctuationOptional[list]c                    t                                           }|r|D ]}|t          |          = |                     |          } | S N)r   copyordr   )r   r   tbl_newpuncts       r   remove_sentence_punctuationr   3  sM    hhjjG $( 	$ 	$EE

##	GAHr   c                    t          j        dd|           }t          j        dd|          }|                                S )zCleans extra whitespace characters that appear between words.

    Example
    -------
    ITEM 1.     BUSINESS -> ITEM 1. BUSINESS
    [\xa0\n]r$   	([ ]{2,})r-   r   r   )r   r!   s     r   clean_extra_whitespacer   <  s<     6+sD11L6,\::Lr   c                R    t          j        dd|                                           S )zhCleans dash characters in text.

    Example
    -------
    ITEM 1. -BUSINESS -> ITEM 1.  BUSINESS
    z	[-\u2013]r$   r   r|   s    r   clean_dashesr   H  s$     6,T**00222r   c                P    |                                                      d          S )zvClean all trailing punctuation in text

    Example
    -------
    ITEM 1.     BUSINESS. -> ITEM 1.     BUSINESS
    z.,:;)r   rstripr|   s    r   clean_trailing_punctuationr   S  s      ::<<v&&&r   utf-8encodingc                    t          |          }t          j        |                     |                                        |          S )u   Replaces MIME encodings with their equivalent characters in the specified encoding.

    Example
    -------
    5 w=E2=80-99s -> 5 w’s
    )r   quopridecodestringr   r   )r   r   formatted_encodings      r   replace_mime_encodingsr   ]  s?     -X66t{{+=>>??FFGYZZZr   FTr&   ignore_caseboolr   c                    |rt           j        nd}t          j        d| d| |          }|r|                                n|}|S )a  Removes prefixes from a string according to the specified pattern. Strips leading
    whitespace if the strip parameter is set to True.

    Input
    -----
    text: The text to clean
    pattern: The pattern for the prefix. Can be a simple string or a regex pattern
    ignore_case: If True, ignores case in the pattern
    strip: If True, removes leading whitespace from the cleaned string.
    r   ^r   flags)r-   
IGNORECASEr   lstripr   r&   r   r   r   
clean_texts         r   clean_prefixr   h  sS     )/BMMaEWD>>>J(-=""$$$:Jr   c                    |rt           j        nd}t          j        | dd| |          }|r|                                n|}|S )a  Removes postfixes from a string according to the specified pattern. Strips trailing
    whitespace if the strip parameters is set to True.

    Input
    -----
    text: The text to clean
    pattern: The pattern for the postfix. Can be a simple string or a regex pattern
    ignore_case: If True, ignores case in the pattern
    strip: If True, removes trailing whitespace from the cleaned string.
    r   $r   r   )r-   r   r   r   r   s         r   clean_postfixr   y  sS     )/BMMaEGD>>>J(-=""$$$:Jr   extra_whitespacedashesbulletstrailing_punctuation	lowercasec                    |r|                                  n| }|rt          |          n|}|rt          |          n|}|rt          |          n|}|rt	          |          n|}|                                S )al  Cleans text.

    Input
    -----
    extra_whitespace: Whether to clean extra whitespace characters in text.
    dashes: Whether to clean dash characters in text.
    bullets: Whether to clean unicode bullets from a section of text.
    trailing_punctuation: Whether to clean all trailing punctuation in text.
    lowercase: Whether to return lowercase text.
    )lowerr   r   r   r"   r   )r   r   r   r   r   r   r!   s          r   cleanr     s    & $-64::<<<$L4HZ"<000l  28I<---\L;K]),777Q]L29K=...|Lr   c                |    t          d | D                       }t          |          }|                    |          S )zgConverts a string representation of a byte string to a regular string using the
    specified encoding.c                ,    g | ]}t          |          S r]   )r   )rY   chars     r   r^   z*bytes_string_to_string.<locals>.<listcomp>  s    333dD		333r   )bytesr   r   )r   r   
text_bytesr   s       r   bytes_string_to_stringr     sC     33d33344J,X66/000r   Tuple[str, np.ndarray]c                4   t          j        dd|           }t          j        dd|          }|                                }t          j        t          |                     }d\  }}}|t          |          k     r| |         ||         k    sPt          t          j        d| |                             r8t          t          j        d||                             r|||<   |dz  }|dz  }|dz  }|||<   |dz  }|t          |          k     |||d<   ||fS )a  Cleans extra whitespace characters that appear between words.
    Calculate distance between characters of original text and cleaned text.

    Returns cleaned text along with array of indices it has moved from original.

    Example
    -------
    ITEM 1.     BUSINESS -> ITEM 1. BUSINESS
    array([0., 0., 0., 0., 0., 0., 0., 0., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.]))
    r   r$   r   )r   r   r   u   [ 
]r   N)r-   r   r   npzerosr.   r   r   )r   r!   moved_indicesdistanceoriginal_indexcleaned_indexs         r   %clean_extra_whitespace_with_index_runr     s6    6+sD11L6,\::L%%''LHSYY''M.5+Hnm
#l++
+
+<#>>>*d>&:;;<< ?RXc<#>??@@ ? ,4M-(aNQMA'/m$! #l++
+
+ %-M-..!&&r   c                2    t          | ||          z
            S r   )rl   )indexr   s     r   -index_adjustment_after_clean_extra_whitespacer     s    u}U++,,,r   )r   r   )r   r   r   r   )rJ   r   r   rK   )r   r   rR   rS   rT   rS   r   r   )r   r   rT   rS   r   r   )r   r   rT   rh   r   r   )
r   r   rR   rS   rk   rl   rm   rn   r   r   )r   r   r   r   )r   r   r   r   r   r   )r   )r   r   r   r   r   r   )FT)
r   r   r&   r   r   r   r   r   r   r   )FFFFF)r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   )r   r   r   r   )r   rl   )0
__future__r   r   r-   sysr   typingr   r   numpyr    unstructured.file_utils.encodingr   unstructured.nlp.patternsr   r   r	   r
   r   r   r   r   r"   r2   rI   rQ   re   rg   rj   ru   r}   dictfromkeysrange
maxunicoder   r   r   r   r   r   r   r   r   r   r   r   r   r]   r   r   <module>r      s=   " " " " " "  				 



     " " " " " " " "                          	 	 	 	          0   >   < #7'B') ') ') ') ')X (5) ) ) ) )8 #>) ) ) ) ). #0	( ( ( ( (J       F mm  uS^$$    
      	  	  	  	 3 3 3 3' ' ' '[ [ [ [ [    "    & #!&         :1 1 1 1 1$' $' $' $'N- - - - - -r   