
    g^"                         d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z
 ddlmZmZmZ ddlmZmZ  G d	 d
ed          Z G d de          ZdS )z
Processor class for Donut.
    N)contextmanager)ListOptionalUnion   )
ImageInput)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                       e Zd Zi ZdS )DonutProcessorKwargsN)__name__
__module____qualname__	_defaults     f/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/donut/processing_donut.pyr   r      s        IIIr   r   F)totalc            
            e Zd ZdZddgZdZdZd fd	Z	 	 	 	 dded	e	e
eee         eef                  d
ee         fdZd Zd Zed             ZddZed             Zed             Z xZS )DonutProcessora  
    Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
    processor.

    [`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and
    [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and
    [`~DonutProcessor.decode`] for more information.

    Args:
        image_processor ([`DonutImageProcessor`], *optional*):
            An instance of [`DonutImageProcessor`]. The image processor is a required input.
        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*):
            An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
    image_processor	tokenizerAutoImageProcessorAutoTokenizerNc                 ,   d }d|v r/t          j        dt                     |                    d          }||n|}|t	          d          |t	          d          t                                          ||           | j        | _        d| _	        d S )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.z)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.F)
warningswarnFutureWarningpop
ValueErrorsuper__init__r   current_processor_in_target_context_manager)selfr   r   kwargsr   	__class__s        r   r&   zDonutProcessor.__init__5   s     &((M  
 !'

+> ? ?-<-H//N_"HIIIABBB)444!%!5*/'''r   imagestextr*   c                 *   | j         r | j        ||fi |S ||t          d           | j        t          fd| j        j        i|}| | j        |fi |d         }| | j        |fi |d         }||S ||S |d         |d<   |d         |d<   |S )a  
        When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
        [`~AutoImageProcessor.__call__`] and returns its output. If used in the context
        [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
        [`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
        NzBYou need to specify either an `images` or `text` input to process.tokenizer_init_kwargsimages_kwargstext_kwargs	input_idslabels)r(   r'   r$   _merge_kwargsr   r   init_kwargsr   )	r)   r,   r-   audiovideosr*   output_kwargsinputs	encodingss	            r   __call__zDonutProcessor.__call__I   s     * 	B)4)&$AA&AAA>dlabbb** 
 
"&."<
 
 
 )T)&SSM/4RSSF&tLL}]/KLLI<M^(5F8"+K"8F;Mr   c                 &     | j         j        |i |S )z
        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r   batch_decoder)   argsr*   s      r   r=   zDonutProcessor.batch_decoder   s    
 +t~*D;F;;;r   c                 &     | j         j        |i |S )z
        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
        docstring of this method for more information.
        )r   decoder>   s      r   rA   zDonutProcessor.decodey   s    
 %t~$d5f555r   c              #      K   t          j        d           d| _        | j        | _        dV  | j        | _        d| _        dS )z
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR.
        z`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your labels by using the argument `text` of the regular `__call__` method (either in the same call as your images inputs, or in a separate call.TNF)r    r!   r(   r   r'   r   r)   s    r   as_target_processorz"DonutProcessor.as_target_processor   sW      
 	9	
 	
 	

 +/'!%!%!5*/'''r   Fc                    || j                                         }i }|rt          j        d|t          j                  }|nh|                    d          }t          j        |          }t          j        d| d|t          j                  }|                                }||                    |d          }n|                                }t          j        |          }	t          j        |          }
t          j        |	 d|
 |t          j        t          j        z            }||                    d          	                                }d|v r?d|v r;| 
                    |d	|
          }|r t          |          dk    r|d         }|||<   ng ||<   |                    d          D ]Y}|	                                }||v r$|d         dk    r|dd         dk    r
|dd         }||                             |           Zt          ||                   dk    r||         d         ||<   ||                    |          t          |          z   d         	                                }|dd         dk    r$|g| 
                    |dd         d	|
          z   S |t          |          r|r|gn|S |rg nd|iS )zS
        Convert a (generated) token sequence into an ordered JSON format.
        Nz	<s_(.*?)>   z</s_> z(.*?)z<s_T)is_inner_valueadded_vocabr   z<sep/><z/>   text_sequence)r   get_added_vocabresearch
IGNORECASEgroupescapereplaceDOTALLstrip
token2jsonlensplitappendfind)r)   tokensrI   rJ   outputstart_tokenkeykey_escaped	end_tokenstart_token_escapedend_token_escapedcontentvalueleafs                 r   rX   zDonutProcessor.token2json   s    .88::K &	p)L&"-HHK"##A&&C)C..K	"8+"8"8"8&"-PPI%++--K R88%OO--	&(i&<&<#$&Ii$8$8!)*DD1BDDfbm^`^gNg  &%mmA..4466G((W-?-? $Ze f f  0"5zzQ(-a*/F3K&(s$+MM)$<$< 5 5D#'::<<D#{22tAw#~~$rss)W[J[J['+AbDz"3K..t4444vc{++q00*0+a.F3KI 6 6Y G I IJPPRR"1":**"8doofQRRjQUcno&o&oooM  &	pP v;; 	G-9F8869'F22ov-FFr   c                 D    t          j        dt                     | j        S )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r    r!   r"   image_processor_classrC   s    r   feature_extractor_classz&DonutProcessor.feature_extractor_class   s'    u	
 	
 	
 ))r   c                 D    t          j        dt                     | j        S )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r    r!   r"   r   rC   s    r   r   z DonutProcessor.feature_extractor   s'    i	
 	
 	
 ##r   )NN)NNNN)FN)r   r   r   __doc__
attributesri   tokenizer_classr&   r   r   r   strr   r   r   r   r   r;   r=   rA   r   rD   rX   propertyrj   r   __classcell__)r+   s   @r   r   r   !   sH         $[1J0%O0 0 0 0 0 0, "NR' '' uS$s)Y8IIJK' -.' ' ' 'R< < <6 6 6 0 0 ^04G 4G 4G 4Gl * * X* $ $ X$ $ $ $ $r   r   )rl   rP   r    
contextlibr   typingr   r   r   image_utilsr   processing_utilsr	   r
   r   tokenization_utils_baser   r   r   r   r   r   r   <module>rw      s    
			  % % % % % % ( ( ( ( ( ( ( ( ( ( % % % % % % H H H H H H H H H H C C C C C C C C    +5    s$ s$ s$ s$ s$^ s$ s$ s$ s$ s$r   