
    Ng                     t   d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
mZmZ ddedededej        fdZddededed
edef
dZddededed
edef
dZdedee         fdZdedee         fdZdedee         fdZdedee         fdZdedee j                  fdZdefdZdefdZdedee         fdZdS )    N)ListOptional)EMAIL_ADDRESS_PATTERNEMAIL_DATETIMETZ_PATTERNIMAGE_URL_PATTERNIP_ADDRESS_NAME_PATTERNIP_ADDRESS_PATTERN_REMAPI_ID_PATTERNUS_PHONE_NUMBERS_REtextpatternindexreturnc                     t          |t                    r|dk     rt          d| d          d }t          t	          j        ||                     D ]\  }}||k    r|}|t          d| d| d          |S )Nr   zThe index is z'. Index must be a non-negative integer.zResult with index z& was not found. The largest index was .)
isinstanceint
ValueError	enumeraterefinditer)r   r   r   regex_matchiresults         Y/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/cleaners/extract.py_get_indexed_matchr      s    eS!! YUQYYWWWWXXXKr{7D99:: ! !	6:: K_e__[\___```    Tstripc                     t          | ||          }|                                \  }}| d|         }|r|                                n|S )a  Extracts texts that occurs before the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
    index.

    Input
    -----
    strip: If True, removes trailing whitespace from the extracted string
    N)r   spanrstrip)r   r   r   r   r   start_before_texts           r   extract_text_beforer%      sS     %T7E::K!!HE1vv,K#(9;k9r   c                     t          | ||          }|                                \  }}| |d         }|r|                                n|S )a  Extracts texts that occurs before the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
    index.

    Input
    -----
    strip: If True, removes leading whitespace from the extracted string
    N)r   r    lstrip)r   r   r   r   r   r#   endr$   s           r   extract_text_afterr)   .   sS     %T7E::KFAsstt*K#(9;k9r   c                 Z    t          j        t          |                                           S N)r   findallr   lowerr   s    r   extract_email_addressr/   =   s    :+TZZ\\:::r   c                 6    t          j        t          |           S r+   )r   r,   r	   r.   s    r   extract_ip_addressr1   A   s    :+T222r   c                 6    t          j        t          |           S r+   )r   r,   r   r.   s    r   extract_ip_address_namer3   E   s    :-t444r   c                 R    t          j        t          |           }d |D             }|S )Nc                 :    g | ]}|                     d d          S ); )replace).0mids     r   
<listcomp>z#extract_mapi_id.<locals>.<listcomp>K   s&    999C$$999r   )r   r,   r
   )r   mapi_idss     r   extract_mapi_idr=   I   s+    z/400H99999HOr   c                     t          j        t          |           }t          |          dk    r&t          j                            |d         d          S d S )Nr   z%a, %d %b %Y %H:%M:%S %z)r   r,   r   lendatetimestrptime)r   date_extractionss     r   extract_datetimetzrC   O   sN    z":DAA
q   ))*:1*=?YZZZtr   c                     t          j        |           }|dS |                                \  }}| ||         }|                                S )a  Extracts a US phone number from a section of text that includes a phone number. If there
    is no phone number present, the result will be an empty string.

    Example
    -------
    extract_phone_number("Phone Number: 215-867-5309") -> "215-867-5309"
    Nr7   )r   searchr    r   )r   r   r"   r(   phone_numbers        r   extract_us_phone_numberrG   W   sS     &,T22Kr!!##JE3c	?Lr   c                    d\  }}}}|                                  }t          d|d         vd|d         v g          r|||fS t          j         d|d                   }|d         s|d= t          |d                   dk    r|||fS |^}}|rG	 |^}}}n# t          $ r |}Y nw xY wd	                    |          }|rd	                    |          nd
}|||fS )a  Extracts the start of bulleted text sections bullets
    accounting numeric and alphanumeric types.

    Output
    -----
    tuple(section, sub_section, sub_sub_section): Each bullet partition
    is a string or None if not present.

    Example
    -------
    This is a very important point -> (None, None, None)
    1.1 This is a very important point -> ("1", "1", None)
    a.1 This is a very important point -> ("a", "1", None)
    )NNNNr   r   z..z[\.])r   string   r7   N)splitanyr   r?   r   join)r   abctemptext_spbulletr#   s           r   extract_ordered_bulletsrU   h   s    +MAq!TjjllG
Cwqz!471:#5677 !QwXggaj999F": 2J
6!9~~!QwHA &	HAq11 	 	 	AAA	GGAJJ%BGGAJJJa7Ns   B B#"B#c                 6    t          j        t          |           S r+   )r   r,   r   r.   s    r   extract_image_urls_from_htmlrW      s    :'...r   )r   )r   T)r@   r   typingr   r   unstructured.nlp.patternsr   r   r   r   r	   r
   r   strr   Matchr   boolr%   r)   r/   r1   r3   r=   rC   rG   tuplerU   rW    r   r   <module>r_      sc    				 ! ! ! ! ! ! ! !                  S 3 s 28    : :c :C : : :X[ : : : :: :S :3 :s :t :WZ : : : :; ;S	 ; ; ; ;3S 3T#Y 3 3 3 35# 5$s) 5 5 5 5# $s)    S Xh.?%@     #        "#U # # # #L/s /tCy / / / / / /r   