
    g$                     v    d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZ  G d d	e          ZdS )
z 
Processor class for LayoutXLM.
    N)ListOptionalUnion   )ProcessorMixin)BatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypec            (       R    e Zd ZdZddgZdZdZd% fd	Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&dee	e
ee	         ee
         f         deee
ee
         f                  deeee                  eeee                           f         deeee         eee                  f                  dedeeeef         deeeef         dee         dedee         dee         dee         dedededededeeeef                  def&dZd Zd  Zd! Zed"             Zed#             Zed$             Z xZS )'LayoutXLMProcessoran  
    Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single
    processor.

    [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.

    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
    get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
    [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
    into token-level `labels` for token classification tasks (such as FUNSD, CORD).

    Args:
        image_processor (`LayoutLMv2ImageProcessor`, *optional*):
            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
        tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*):
            An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
    image_processor	tokenizerLayoutLMv2ImageProcessor)LayoutXLMTokenizerLayoutXLMTokenizerFastNc                    d|v r/t          j        dt                     |                    d          }||n|}|t	          d          |t	          d          t                                          ||           d S )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.z)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.)warningswarnFutureWarningpop
ValueErrorsuper__init__)selfr   r   kwargsr   	__class__s        n/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/layoutxlm/processing_layoutxlm.pyr   zLayoutXLMProcessor.__init__3   s    &((M  
 !'

+> ? ?-<-H//N_"HIIIABBB)44444    TFr   text	text_pairboxesword_labelsadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                 `   | j         j        r|t          d          | j         j        r|t          d          |du r|du rt          d          |                      ||          }|.| j         j        r"| t          |t                    r|g}|d         } | j        di d	||n|d         d
||ndd||n|d         d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d||}|                    d          }|du r|                     ||d                   }||d<   |S )a  
        This method first forwards the `images` argument to [`~LayoutLMv2ImagePrpcessor.__call__`]. In case
        [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
        bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output,
        together with resized `images`. In case [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to
        `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional
        arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images``.

        Please refer to the docstring of the above two methods for more information.
        NzdYou cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True.zaYou cannot provide word labels if you initialized the image processor with apply_ocr set to True.TFzKYou cannot return overflowing tokens without returning the offsets mapping.)imagesr4   wordsr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   pixel_valuesoverflow_to_sample_mappingimage )r   	apply_ocrr   
isinstancestrr   r   get_overflowing_images)r   r7   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r   featuresencoded_inputss                          r!   __call__zLayoutXLMProcessor.__call__D   s>   D ) 	u/@U  
 ) 	{/Fs   %,,1G51P1Pjkkk ''vn'UU  4 >9CT$$$ v )I' 
 
 
)x/@
#,#8iid
 !,%%(72C
 $	

  21
 G
 "z
 "z
 6
  21
 #8"7
 #8"7
 '@&?
 (B'A
 $:#9
  (-!
" G#
$ *>'
 
. n--$,,00Hd9effF"(wr"   c                     g }|D ]}|                     ||                    t          |          t          |          k    r/t          dt          |           dt          |                     |S )Nz`Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got z and )appendlenr   )r   r7   r:   images_with_overflow
sample_idxs        r!   r@   z)LayoutXLMProcessor.get_overflowing_images   s    !4 	< 	<J ''z(:;;;;#$$,F(G(GGGV,--V V478R4S4SV V  
 $#r"   c                 &     | j         j        |i |S )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r   batch_decoder   argsr   s      r!   rJ   zLayoutXLMProcessor.batch_decode   s    
 +t~*D;F;;;r"   c                 &     | j         j        |i |S )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        )r   decoderK   s      r!   rN   zLayoutXLMProcessor.decode   s    
 %t~$d5f555r"   c                 
    g dS )N)	input_idsbboxattention_maskr;   r<   r   s    r!   model_input_namesz$LayoutXLMProcessor.model_input_names   s    ????r"   c                 D    t          j        dt                     | j        S )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r   r   r   image_processor_classrS   s    r!   feature_extractor_classz*LayoutXLMProcessor.feature_extractor_class   s'    u	
 	
 	
 ))r"   c                 D    t          j        dt                     | j        S )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r   r   r   r   rS   s    r!   r   z$LayoutXLMProcessor.feature_extractor   s'    i	
 	
 	
 ##r"   )NN)NNNNTFNNr   NNNFFFFTN)__name__
__module____qualname____doc__
attributesrV   tokenizer_classr   r   r   r
   r   r   intboolr?   r	   r   r   r   rC   r@   rJ   rN   propertyrT   rW   r   __classcell__)r    s   @r!   r   r      s        & $[1J6FO5 5 5 5 5 5( _cQU?CCG#'5:;?$(,00404*/+0',#;?)U U I0$y/4HYCZZ[U E"3T:K5L"LMN	U
 T$s)_d4S	?&;;<U eDItDI$>?@U !U tS/12U $%778U SMU U %SMU  (~U  (~U $(U  %)!U" !%#U$ %U& 'U( !sJ!78)U, 
-U U U Un$ $ $< < <6 6 6 @ @ X@ * * X* $ $ X$ $ $ $ $r"   r   )r\   r   typingr   r   r   processing_utilsr   tokenization_utils_baser   r	   r
   r   r   utilsr   r   r<   r"   r!   <module>rg      s      ( ( ( ( ( ( ( ( ( ( . . . . . . w w w w w w w w w w w w w w      n$ n$ n$ n$ n$ n$ n$ n$ n$ n$r"   