
"""
Processor class for Pixtral.
"""

from typing import List, Union

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image, load_image
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import is_torch_device, is_torch_dtype, is_torch_tensor, logging, requires_backends


logger = logging.get_logger(__name__)


class PixtralProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {"padding": False},
        "images_kwargs": {},
        "common_kwargs": {"return_tensors": "pt"},
    }


def is_url(val) -> bool:
    return isinstance(val, str) and val.startswith("http")


def is_image_or_image_url(elem):
    return is_url(elem) or is_valid_image(elem)


class BatchMixFeature(BatchFeature):
    def to(self, *args, **kwargs) -> "BatchMixFeature":
        """
        Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
        different `dtypes` and sending the `BatchFeature` to a different `device`.

        Args:
            args (`Tuple`):
                Will be passed to the `to(...)` function of the tensors.
            kwargs (`Dict`, *optional*):
                Will be passed to the `to(...)` function of the tensors.

        Returns:
            [`BatchFeature`]: The same instance after modification.
        """
        requires_backends(self, ["torch"])
        import torch  # noqa

        new_data = {}
        device = kwargs.get("device")
        # Check if the args are a device or a dtype
        if device is None and len(args) > 0:
            # The device should always be the first positional argument
            arg = args[0]
            if is_torch_dtype(arg):
                # The first argument is a dtype
                pass
            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
                device = arg
            else:
                # it's something else
                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
        # Only floating-point tensors are cast, to avoid turning `LongTensor` inputs into `FloatTensor`
        for k, v in self.items():
            if isinstance(v, list):
                # Nested lists of tensors (e.g. variable-sized pixel values) are moved element by element
                new_data[k] = [
                    element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element)
                ]
            elif isinstance(v, torch.Tensor) and torch.is_floating_point(v):
                # cast and send to device
                new_data[k] = v.to(*args, **kwargs)
            elif isinstance(v, torch.Tensor) and device is not None:
                new_data[k] = v.to(device=device)
            else:
                new_data[k] = v
        self.data = new_data
        return self


class PixtralProcessor(ProcessorMixin):
    r"""
    Constructs a Pixtral processor which wraps a Pixtral image processor and a Pixtral tokenizer into a single processor.

    [`PixtralProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizerFast`]. See the
    [`~PixtralProcessor.__call__`] and [`~PixtralProcessor.decode`] for more information.

    Args:
        image_processor ([`PixtralImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
        patch_size (`int`, *optional*, defaults to 16):
            Patch size from the vision tower.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        image_token (`str`, *optional*, defaults to `"[IMG]"`):
            Special token used to denote image location.
        image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`):
            Special token used to denote the end of a line of pixels in an image.
        image_end_token (`str`, *optional*, defaults to `"[IMG_END]"`):
            Special token used to denote the end of an image input.
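
    Example (an illustrative usage sketch; the checkpoint id, image URL and prompt format below are
    assumptions for demonstration purposes and are not taken from this file):

    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor

    >>> processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
    >>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
    >>> inputs = processor(images=image, text="<s>[INST]Describe the image.[IMG][/INST]", return_tensors="pt")
    >>> # `inputs` is a `BatchMixFeature` holding `input_ids`, `attention_mask` and `pixel_values`
    ```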
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template", "patch_size", "image_token", "image_break_token", "image_end_token"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        patch_size: int = 16,
        chat_template=None,
        image_token="[IMG]",
        image_break_token="[IMG_BREAK]",
        image_end_token="[IMG_END]",
        **kwargs,
    ):
        self.patch_size = patch_size
        self.image_token = image_token
        self.image_break_token = image_break_token
        self.image_end_token = image_end_token
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[PixtralProcessorKwargs],
    ) -> BatchMixFeature:
        """
        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
            `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        # Check if images and text inputs are reversed for backwards compatibility
        images, text = _validate_images_text_input_order(images, text)

        output_kwargs = self._merge_kwargs(
            PixtralProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if images is not None:
            if is_image_or_image_url(images):
                images = [[images]]
            elif isinstance(images, list) and is_image_or_image_url(images[0]):
                images = [images]
            elif (
                not isinstance(images, list)
                and not isinstance(images[0], list)
                and not is_image_or_image_url(images[0][0])
            ):
                raise ValueError(
                    "Invalid input images. Please provide a single image or a list of images or a list of list of images."
                )
            images = [[load_image(im) for im in sample] for sample in images]
            image_inputs = self.image_processor(images, patch_size=self.patch_size, **output_kwargs["images_kwargs"])
        else:
            image_inputs = {}

        if isinstance(text, str):
            text = [text]
        elif not isinstance(text, list) and not isinstance(text[0], str):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")

        # Try to expand image tokens in the prompts if we have the necessary parts
        prompt_strings = text
        if image_inputs.get("pixel_values") is not None:
            # Replace each image token with the expanded image token sequence
            pixel_values = image_inputs.get("pixel_values")
            image_sizes = image_inputs.pop("image_sizes")
            prompt_strings = []

            for sample_images, sample_image_sizes, sample in zip(pixel_values, image_sizes, text):
                replace_strings = []
                # First calculate the number of tokens needed for each image and put in a placeholder
                for image, image_size in zip(sample_images, sample_image_sizes):
                    height, width = image_size
                    num_height_tokens = height // self.patch_size
                    num_width_tokens = width // self.patch_size
                    replace_tokens = [
                        [self.image_token] * num_width_tokens + [self.image_break_token]
                    ] * num_height_tokens
                    # Flatten the nested list of tokens
                    replace_tokens = [item for sublist in replace_tokens for item in sublist]
                    replace_tokens[-1] = self.image_end_token
                    replace_str = "".join(replace_tokens)
                    replace_strings.append(replace_str)
                    sample = sample.replace(self.image_token, "<placeholder>", 1)

                while "<placeholder>" in sample:
                    replace_str = replace_strings.pop(0)
                    sample = sample.replace("<placeholder>", replace_str, 1)
                prompt_strings.append(sample)

        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
        return BatchMixFeature(data={**text_inputs, **image_inputs})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))