
    g                     b    d Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ  G d de          Zd	S )
z
Processor class for MarkupLM.
    )OptionalUnion   )
TensorType)ProcessorMixin)BatchEncodingPaddingStrategyTruncationStrategyc                       e Zd ZdZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dded	eee	e
f         d
eee	ef         dee         dedee         dee         dee         dedededededeee	ef                  defdZd Zd Zed             ZdS )MarkupLMProcessoraJ  
    Constructs a MarkupLM processor which combines a MarkupLM feature extractor and a MarkupLM tokenizer into a single
    processor.

    [`MarkupLMProcessor`] offers all the functionalities you need to prepare data for the model.

    It first uses [`MarkupLMFeatureExtractor`] to extract nodes and corresponding xpaths from one or more HTML strings.
    Next, these are provided to [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`], which turns them into token-level
    `input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and `xpath_subs_seq`.

    Args:
        feature_extractor (`MarkupLMFeatureExtractor`):
            An instance of [`MarkupLMFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`MarkupLMTokenizer` or `MarkupLMTokenizerFast`):
            An instance of [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`]. The tokenizer is a required input.
        parse_html (`bool`, *optional*, defaults to `True`):
            Whether or not to use `MarkupLMFeatureExtractor` to parse HTML strings into nodes and corresponding xpaths.
    MarkupLMFeatureExtractor)MarkupLMTokenizerMarkupLMTokenizerFastTNFr   add_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                    | j         rL|t          d          |||t          d          |                     |          }|d         }|d         }n$|t          d          ||t          d          || j         rt          |t                    r|g} | j        di d||n|d	||ndd|d
|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d||}|S )a  
        This method first forwards the `html_strings` argument to [`~MarkupLMFeatureExtractor.__call__`]. Next, it
        passes the `nodes` and `xpaths` along with the additional arguments to [`~MarkupLMTokenizer.__call__`] and
        returns the output.

        Optionally, one can also provide a `text` argument which is passed along as first sequence.

        Please refer to the docstring of the above two methods for more information.
        NzDMake sure to pass HTML strings in case `parse_html` is set to `True`zUPlease don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`nodesxpathsz@You have passed HTML strings but `parse_html` is set to `False`.zIMake sure to pass nodes and xpaths in case `parse_html` is set to `False`text	text_pairnode_labelsr   r   r   r   r   r   r   r   r   r   r   r   r   r    )
parse_html
ValueErrorfeature_extractor
isinstancestr	tokenizer)selfhtml_stringsr    r!   r$   	questionsr   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargsfeaturesencoded_inputss                          l/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/markuplm/processing_markuplm.py__call__zMarkupLMProcessor.__call__2   s   B ? 	n# !ghhh F$6+:Q k   --l;;HW%Eh'FF' !cddd} !lmmm  T_ )S)) (&K	' 
 
 
'3
(4ee$
 6
 $	

  21
 G
 "z
 "z
 6
  21
 #8"7
 #8"7
 '@&?
 (B'A
 $:#9
  (-!
" G#
$ *>'
 
,     c                 &     | j         j        |i |S )z
        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r+   batch_decoder,   argsr/   s      r2   r6   zMarkupLMProcessor.batch_decode   s    
 +t~*D;F;;;r4   c                 &     | j         j        |i |S )z
        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
        docstring of this method for more information.
        )r+   decoder7   s      r2   r:   zMarkupLMProcessor.decode   s    
 %t~$d5f555r4   c                     | j         j        }|S )N)r+   model_input_names)r,   tokenizer_input_namess     r2   r<   z#MarkupLMProcessor.model_input_names   s     $ @$$r4   )NNNNNTFNNr   NNNFFFFTN)__name__
__module____qualname____doc__feature_extractor_classtokenizer_classr&   boolr   r*   r	   r
   r   intr   r   r3   r6   r:   propertyr<   r%   r4   r2   r   r      s        & 9DOJ #'5:;?$(,00404*/+0',#;?)N N !N tS/12N $%778N SMN N %SMN  (~N  (~N $(N  %)!N" !%#N$ %N& 'N( !sJ!78)N, 
-N N N N`< < <6 6 6 % % X% % %r4   r   N)rA   typingr   r   
file_utilsr   processing_utilsr   tokenization_utils_baser   r	   r
   r   r%   r4   r2   <module>rK      s     # " " " " " " " $ $ $ $ $ $ . . . . . . Y Y Y Y Y Y Y Y Y Yy% y% y% y% y% y% y% y% y% y%r4   