
    Ng                         d dl Z d dlZd dlZd dlZddlmZmZ ddl mZ ddl	m
Z
  e
            rd dlZ G d de          Z G d	 d
e          ZdS )    N   )BaseOCRAgentBaseOCRElementType   )load_dataframe)is_pytesseract_availablec                   R    e Zd ZdZdZdZdZdZdZe	d             Z
e	d             Zd	S )
TesseractFeatureTypez7
    The element types for Tesseract Detection API
    r   r   r         c           
          t           j        dt           j        dt           j        dt           j        dt           j        di}||          S )Npage_num	block_numpar_numline_numword_num)r
   PAGEBLOCKPARALINEWORD)selfname_cvts     \/var/www/html/ai-engine/env/lib/python3.11/site-packages/layoutparser/ocr/tesseract_agent.py	attr_namezTesseractFeatureType.attr_name(   s?     !%z & %y %z %z
 ~    c                 $    g d}|d | dz            S )N)r   r   r   r   r   r    )r   levelss     r   group_levelsz!TesseractFeatureType.group_levels3   s     MMMjqj!!r   N)__name__
__module____qualname____doc__r   r   r   r   r   propertyr   r    r   r   r   r
   r
      sm          DEDDD  X " " X" " "r   r
   c                       e Zd ZdZdgZddZed             Zd Z	 dd
Z	e
d             Ze
d             Ze
d             Zd	S )TesseractAgentz
    A wrapper for `Tesseract <https://github.com/tesseract-ocr/tesseract>`_ Text
    Detection APIs based on `PyTesseract <https://github.com/tesseract-ocr/tesseract>`_.
    pytesseractengc                 v    t          |t                    r|nd                    |          | _        || _        dS )af  Create a Tesseract OCR Agent.

        Args:
            languages (:obj:`list` or :obj:`str`, optional):
                You can specify the language code(s) of the documents to detect to improve
                accuracy. The supported language and their code can be found on
                `its github repo <https://github.com/tesseract-ocr/langdata>`_.
                It supports two formats: 1) you can pass in the languages code as a string
                of format like `"eng+fra"`, or 2) you can pack them as a list of strings
                `["eng", "fra"]`.
                Defaults to 'eng'.
        +N)
isinstancestrjoinlangconfigs)r   	languageskwargss      r   __init__zTesseractAgent.__init__A   s5     ",Is!;!;TII)ATAT	r   c                 4    |t           j         _         | di |S )Nr   )r(   tesseract_cmd)clstesseract_cmd_pathr2   s      r   with_tesseract_executablez(TesseractAgent.with_tesseract_executableQ   s"     1C-s}}V}}r   c                    i }t          j        |fd| j        i| j        |d<   t          j        |fd| j        i| j        }t          j        t          j        |          t          j
        dddt          i          |d<   |S )Nr/   textzutf-8	)quotingencodingsep
convertersdata)r(   image_to_stringr/   r0   image_to_datapdread_csvioStringIOcsv
QUOTE_NONEr-   )r   img_contentres_datas       r   _detectzTesseractAgent._detectW   s    !1
 
"i
+/<
 
F )+VVDIVVVkKN}
 
 
F 
r   FTNc                     |                      |          }|r|S |r|d         S ||                     ||          S |d         S )a  Send the input image for OCR.

        Args:
            image (:obj:`np.ndarray` or :obj:`str`):
                The input image array or the name of the image file
            return_response (:obj:`bool`, optional):
                Whether directly return all output (string and boxes
                info) from Tesseract.
                Defaults to `False`.
            return_only_text (:obj:`bool`, optional):
                Whether return only the texts in the OCR results.
                Defaults to `False`.
            agg_output_level (:obj:`~TesseractFeatureType`, optional):
                When set, aggregate the GCV output with respect to the
                specified aggregation level. Defaults to `None`.
        r:   )rL   gather_data)r   imagereturn_responsereturn_only_textagg_output_levelrJ   s         r   detectzTesseractAgent.detectf   s[    ( ll5!! 	J 	v;'##C)9:::6{r   c           
         t          |t                    sJ d|             | d         }||j                                                                      |j                                      d                               d                                                              dddd	d
ddd          	                    d d d          
                    dd	g          }t          |          S )zo
        Gather the OCR'ed text, bounding boxes, and confidence
        in a given aggeragation level.
        zInvalid agg_level r@   c           
      d   t          j        | d                                         | d                                         | d                                         | d                                         | d                                         | d         j                            d          g          S )	Nlefttopwidthheightconfr:    )r>   )rC   Seriesminmaxmeanr-   cat)gps    r   <lambda>z,TesseractAgent.gather_data.<locals>.<lambda>   s    296
((5	7))8((**6
))6
**s*33	 	 r   T)dropx_1y_1whscorer:   id)r   r   r   r   r      index)columnsc                      | j         | j        z   S N)rd   rf   xs    r   rb   z,TesseractAgent.gather_data.<locals>.<lambda>       aeack r   c                      | j         | j        z   S rn   )re   rg   ro   s    r   rb   z,TesseractAgent.gather_data.<locals>.<lambda>   rq   r   	rectangle)x_2y_2
block_type)r,   r
   r:   isnagroupbyr    applyreset_indexrenameassignrc   r   )response	agg_levelrJ   dfs       r   rN   zTesseractAgent.gather_data   s#    +
 
 	, 	,+	++	, 	, 	, v !WY+,,U	 	  [d[##[]]V!   
 
 V))))&   
 T3*T%%C 	H b!!!r   c                 ~    t          | d          5 }t          j        |          }d d d            n# 1 swxY w Y   |S )Nrb)openpickleload)filenamefprJ   s      r   load_responsezTesseractAgent.load_response   sv    (D!! 	"R+b//C	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	"
s   266c                     t          |d          5 }t          j        | |t          j                   d d d            d S # 1 swxY w Y   d S )Nwb)protocol)r   r   dumpHIGHEST_PROTOCOL)rJ   	file_namer   s      r   save_responsezTesseractAgent.save_response   s     )T"" 	CbKR&*ABBBB	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	Cs   "A  AA)r)   )FTN)r!   r"   r#   r$   DEPENDENCIESr3   classmethodr8   rL   rS   staticmethodrN   r   r   r   r   r   r'   r'   9   s         
 "?L       [
    UY   B -" -" \-"^   \
 C C \C C Cr   r'   )rE   rG   r   pandasrC   baser   r   r   
file_utilsr   r(   r
   r'   r   r   r   <module>r      s    
			 



      2 2 2 2 2 2 2 2       1 1 1 1 1 1 " " " " "- " " "8HC HC HC HC HC\ HC HC HC HC HCr   