import re
from typing import List, Optional, Tuple, Union

import numpy as np

from ..utils import (
    ExplicitEnum,
    add_end_docstrings,
    is_pytesseract_available,
    is_torch_available,
    is_vision_available,
    logging,
)
from .base import ChunkPipeline, build_pipeline_init_args
from .question_answering import select_starts_ends


if is_vision_available():
    from PIL import Image

    from .image_utils import load_image

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES

TESSERACT_LOADED = False
if is_pytesseract_available():
    TESSERACT_LOADED = True
    import pytesseract


logger = logging.get_logger(__name__)


def normalize_box(box, width, height):
    return [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]


def apply_tesseract(image: "Image.Image", lang: Optional[str], tesseract_config: Optional[str]):
    """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
    # apply OCR
    data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
    words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]

    # filter empty words and corresponding coordinates
    irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
    words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
    left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
    top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
    width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
    height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]

    # turn coordinates into (left, top, left + width, top + height) format
    actual_boxes = []
    for x, y, w, h in zip(left, top, width, height):
        actual_box = [x, y, x + w, y + h]
        actual_boxes.append(actual_box)

    image_width, image_height = image.size

    # finally, normalize the bounding boxes
    normalized_boxes = []
    for box in actual_boxes:
        normalized_boxes.append(normalize_box(box, image_width, image_height))

    if len(words) != len(normalized_boxes):
        raise ValueError("Not as many words as there are bounding boxes")

    return words, normalized_boxes
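# A quick sanity check of the geometry above (the numbers are illustrative, not taken from any
# real document): Tesseract reports boxes as (left, top, width, height) in pixels, which
# `apply_tesseract` converts to (left, top, right, bottom) before `normalize_box` rescales them
# into the 0-1000 coordinate space that LayoutLM-style models expect. For example, a word at
# left=130, top=110, width=70, height=20 on an 850x1100 px page first becomes the box
# [130, 110, 200, 130], and then:
#
#     normalize_box([130, 110, 200, 130], 850, 1100) == [152, 100, 235, 118]
#
# since int(1000 * 130 / 850) == 152, int(1000 * 110 / 1100) == 100, and so on.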
__module____qualname__LayoutLMLayoutLMv2andv3VisionEncoderDecoderr+   r   r   rN   rN   _   s         H'O3r   rN   )has_image_processorhas_tokenizerc            
           e Zd ZdZ fdZ	 	 	 	 	 	 	 	 	 	 ddee         dee         fdZ	 	 ddedef         d	ee         d
e	ee
e         f         f fdZ	 	 	 	 	 	 	 dd
e	ee
e         f         fdZd ZddZd Z	 ddZ xZS )!DocumentQuestionAnsweringPipelinea  
    Document Question Answering pipeline using any `AutoModelForDocumentQuestionAnswering`. The inputs/outputs are
    similar to the (extractive) question answering pipeline; however, the pipeline takes an image (and optional OCR'd
    words/boxes) as input instead of text context.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> document_qa = pipeline(model="impira/layoutlm-document-qa")
    >>> document_qa(
    ...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
    ...     question="What is the invoice number?",
    ... )
    [{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
    ```
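
    If you already have OCR output (or want to reuse it across calls), you can pass `(word, box)` pairs with
    0->1000-normalized boxes via `word_boxes`, in which case Tesseract is skipped entirely (the pairs below are
    illustrative, not real OCR output):

    ```python
    >>> word_boxes = [("us-001", [152, 100, 235, 118]), ("Invoice", [48, 44, 203, 64])]
    >>> document_qa(
    ...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
    ...     question="What is the invoice number?",
    ...     word_boxes=word_boxes,
    ... )
    ```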

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This document question answering pipeline can currently be loaded from [`pipeline`] using the following task
    identifier: `"document-question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a document question answering task.
    See the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=document-question-answering).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.tokenizer is not None and not self.tokenizer.__class__.__name__.endswith("Fast"):
            raise ValueError(
                "`DocumentQuestionAnsweringPipeline` requires a fast tokenizer, but a slow tokenizer "
                f"(`{self.tokenizer.__class__.__name__}`) is provided."
            )

        if self.model.config.__class__.__name__ == "VisionEncoderDecoderConfig":
            self.model_type = ModelType.VisionEncoderDecoder
            if self.model.config.encoder.model_type != "donut-swin":
                raise ValueError("Currently, the only supported VisionEncoderDecoder model is Donut")
        else:
            self.check_model_type(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES)
            if self.model.config.__class__.__name__ == "LayoutLMConfig":
                self.model_type = ModelType.LayoutLM
            else:
                self.model_type = ModelType.LayoutLMv2andv3

    def _sanitize_parameters(
        self,
        padding=None,
        doc_stride=None,
        max_question_len=None,
        lang: Optional[str] = None,
        tesseract_config: Optional[str] = None,
        max_answer_len=None,
        max_seq_len=None,
        top_k=None,
        handle_impossible_answer=None,
        timeout=None,
        **kwargs,
    ):
        preprocess_params, postprocess_params = {}, {}
        if padding is not None:
            preprocess_params["padding"] = padding
        if doc_stride is not None:
            preprocess_params["doc_stride"] = doc_stride
        if max_question_len is not None:
            preprocess_params["max_question_len"] = max_question_len
        if max_seq_len is not None:
            preprocess_params["max_seq_len"] = max_seq_len
        if lang is not None:
            preprocess_params["lang"] = lang
        if tesseract_config is not None:
            preprocess_params["tesseract_config"] = tesseract_config
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if top_k is not None:
            if top_k < 1:
                raise ValueError(f"top_k parameter should be >= 1 (got {top_k})")
            postprocess_params["top_k"] = top_k
        if max_answer_len is not None:
            if max_answer_len < 1:
                raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len})")
            postprocess_params["max_answer_len"] = max_answer_len
        if handle_impossible_answer is not None:
            postprocess_params["handle_impossible_answer"] = handle_impossible_answer

        return preprocess_params, {}, postprocess_params

    def __call__(
        self,
        image: Union["Image.Image", str],
        question: Optional[str] = None,
        word_boxes: Tuple[str, List[float]] = None,
        **kwargs,
    ):
        """
        Answer the question(s) given as inputs by using the document(s). A document is defined as an image and an
        optional list of (word, box) tuples which represent the text in the document. If the `word_boxes` are not
        provided, it will use the Tesseract OCR engine (if available) to extract the words and boxes automatically for
        LayoutLM-like models which require them as input. For Donut, no OCR is run.

        You can invoke the pipeline several ways:

        - `pipeline(image=image, question=question)`
        - `pipeline(image=image, question=question, word_boxes=word_boxes)`
        - `pipeline([{"image": image, "question": question}])`
        - `pipeline([{"image": image, "question": question, "word_boxes": word_boxes}])`

        Args:
            image (`str` or `PIL.Image`):
                The pipeline handles three types of images:

                - A string containing a http link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images. If given a single image, it can be
                broadcasted to multiple questions.
            question (`str`):
                A question to ask of the document.
            word_boxes (`List[str, Tuple[float, float, float, float]]`, *optional*):
                A list of words and bounding boxes (normalized 0->1000). If you provide this optional input, then the
                pipeline will use these words and boxes instead of running OCR on the image to derive them for models
                that need them (e.g. LayoutLM). This allows you to reuse OCR'd results across many invocations of the
                pipeline without having to re-run it each time.
            top_k (`int`, *optional*, defaults to 1):
                The number of answers to return (will be chosen by order of likelihood). Note that we return less than
                top_k answers if there are not enough options available within the context.
            doc_stride (`int`, *optional*, defaults to 128):
                If the words in the document are too long to fit with the question for the model, it will be split in
                several chunks with some overlap. This argument controls the size of that overlap.
            max_answer_len (`int`, *optional*, defaults to 15):
                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
            max_seq_len (`int`, *optional*, defaults to 384):
                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
                model. The context will be split in several chunks (using `doc_stride` as overlap) if needed.
            max_question_len (`int`, *optional*, defaults to 64):
                The maximum length of the question after tokenization. It will be truncated if needed.
            handle_impossible_answer (`bool`, *optional*, defaults to `False`):
                Whether or not we accept impossible as an answer.
            lang (`str`, *optional*):
                Language to use while running OCR. Defaults to english.
            tesseract_config (`str`, *optional*):
                Additional flags to pass to tesseract while running OCR.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:

            - **score** (`float`) -- The probability associated to the answer.
            - **start** (`int`) -- The start word index of the answer (in the OCR'd version of the input or provided
              `word_boxes`).
            - **end** (`int`) -- The end word index of the answer (in the OCR'd version of the input or provided
              `word_boxes`).
            - **answer** (`str`) -- The answer to the question.
            - **words** (`list[int]`) -- The index of each word/box pair that is in the answer
        """
        if isinstance(question, str):
            inputs = {"question": question, "image": image}
            if word_boxes is not None:
                inputs["word_boxes"] = word_boxes
        else:
            inputs = image
        return super().__call__(inputs, **kwargs)
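
    # Note on long documents (a rough, illustrative example, not a fixed contract): inputs whose
    # tokenized length exceeds `max_seq_len` are split by the tokenizer into overlapping spans
    # (`stride=doc_stride`, truncating only the context), and `preprocess` below yields one model
    # pass per span. With max_seq_len=384 and doc_stride=128, a ~600-token context produces two
    # spans sharing 128 context tokens, and `postprocess` keeps the best-scoring answers across
    # all spans.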
    def preprocess(
        self,
        input,
        padding="do_not_pad",
        doc_stride=None,
        max_seq_len=None,
        word_boxes: Tuple[str, List[float]] = None,
        lang=None,
        tesseract_config="",
        timeout=None,
    ):
        if max_seq_len is None:
            max_seq_len = self.tokenizer.model_max_length

        if doc_stride is None:
            doc_stride = min(max_seq_len // 2, 256)

        image = None
        image_features = {}
        if input.get("image", None) is not None:
            image = load_image(input["image"], timeout=timeout)
            if self.image_processor is not None:
                image_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    image_inputs = image_inputs.to(self.torch_dtype)
                image_features.update(image_inputs)
            elif self.feature_extractor is not None:
                image_features.update(self.feature_extractor(images=image, return_tensors=self.framework))
            elif self.model_type == ModelType.VisionEncoderDecoder:
                raise ValueError("If you are using a VisionEncoderDecoderModel, you must provide a feature extractor")

        words, boxes = None, None
        if self.model_type != ModelType.VisionEncoderDecoder:
            if "word_boxes" in input:
                words = [x[0] for x in input["word_boxes"]]
                boxes = [x[1] for x in input["word_boxes"]]
            elif "words" in image_features and "boxes" in image_features:
                words = image_features.pop("words")[0]
                boxes = image_features.pop("boxes")[0]
            elif image is not None:
                if not TESSERACT_LOADED:
                    raise ValueError(
                        "If you provide an image without word_boxes, then the pipeline will run OCR using Tesseract,"
                        " but pytesseract is not available"
                    )
                if TESSERACT_LOADED:
                    words, boxes = apply_tesseract(image, lang=lang, tesseract_config=tesseract_config)
            else:
                raise ValueError(
                    "You must provide an image or word_boxes. If you provide an image, the pipeline will"
                    " automatically run OCR to derive words and boxes"
                )

        if self.tokenizer.padding_side != "right":
            raise ValueError(
                "Document question answering only supports tokenizers whose padding side is 'right', not"
                f" {self.tokenizer.padding_side}"
            )

        if self.model_type == ModelType.VisionEncoderDecoder:
            task_prompt = f'<s_docvqa><s_question>{input["question"]}</s_question><s_answer>'
            # Adapted from https://huggingface.co/spaces/nielsr/donut-docvqa/blob/main/app.py
            encoding = {
                "inputs": image_features["pixel_values"],
                "decoder_input_ids": self.tokenizer(
                    task_prompt, add_special_tokens=False, return_tensors=self.framework
                ).input_ids,
                "return_dict_in_generate": True,
            }
            yield {
                **encoding,
                "p_mask": None,
                "word_ids": None,
                "words": None,
                "output_attentions": True,
                "is_last": True,
            }
        else:
            tokenizer_kwargs = {}
            if self.model_type == ModelType.LayoutLM:
                tokenizer_kwargs["text"] = input["question"].split()
                tokenizer_kwargs["text_pair"] = words
                tokenizer_kwargs["is_split_into_words"] = True
            else:
                tokenizer_kwargs["text"] = [input["question"]]
                tokenizer_kwargs["text_pair"] = [words]
                tokenizer_kwargs["boxes"] = [boxes]

            encoding = self.tokenizer(
                padding=padding,
                max_length=max_seq_len,
                stride=doc_stride,
                return_token_type_ids=True,
                truncation="only_second",
                return_overflowing_tokens=True,
                **tokenizer_kwargs,
            )
            # We do not use the overflow-to-sample mapping
            encoding.pop("overflow_to_sample_mapping", None)

            num_spans = len(encoding["input_ids"])

            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer).
            # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens).
            # This logic mirrors the logic in the question_answering pipeline.
            p_mask = np.array([[tok != 1 for tok in encoding.sequence_ids(span_id)] for span_id in range(num_spans)])

            for span_idx in range(num_spans):
                if self.framework == "pt":
                    span_encoding = {k: torch.tensor(v[span_idx : span_idx + 1]) for (k, v) in encoding.items()}
                    if "pixel_values" in image_features:
                        span_encoding["image"] = image_features["pixel_values"]
                else:
                    raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline")

                input_ids_span_idx = encoding["input_ids"][span_idx]
                # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
                if self.tokenizer.cls_token_id is not None:
                    cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0]
                    for cls_index in cls_indices:
                        p_mask[span_idx][cls_index] = 0

                # For each span, place a bounding box [0,0,0,0] for question and CLS tokens, [1000,1000,1000,1000]
                # for SEP tokens, and the word's bounding box for words in the original document.
                if "boxes" not in tokenizer_kwargs:
                    bbox = []
                    for input_id, sequence_id, word_id in zip(
                        encoding.input_ids[span_idx],
                        encoding.sequence_ids(span_idx),
                        encoding.word_ids(span_idx),
                    ):
                        if sequence_id == 1:
                            bbox.append(boxes[word_id])
                        elif input_id == self.tokenizer.sep_token_id:
                            bbox.append([1000] * 4)
                        else:
                            bbox.append([0] * 4)

                    if self.framework == "pt":
                        span_encoding["bbox"] = torch.tensor(bbox).unsqueeze(0)

                yield {
                    **span_encoding,
                    "p_mask": p_mask[span_idx],
                    "word_ids": encoding.word_ids(span_idx),
                    "words": words,
                    "is_last": span_idx == num_spans - 1,
                }

    def _forward(self, model_inputs, **generate_kwargs):
        p_mask = model_inputs.pop("p_mask", None)
        word_ids = model_inputs.pop("word_ids", None)
        words = model_inputs.pop("words", None)
        is_last = model_inputs.pop("is_last", False)

        if self.model_type == ModelType.VisionEncoderDecoder:
            if "generation_config" not in generate_kwargs:
                generate_kwargs["generation_config"] = self.generation_config
            model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
        else:
            model_outputs = self.model(**model_inputs)

        model_outputs = dict(model_outputs.items())
        model_outputs["p_mask"] = p_mask
        model_outputs["word_ids"] = word_ids
        model_outputs["words"] = words
        model_outputs["attention_mask"] = model_inputs.get("attention_mask", None)
        model_outputs["is_last"] = is_last
        return model_outputs

    def postprocess(self, model_outputs, top_k=1, **kwargs):
        if self.model_type == ModelType.VisionEncoderDecoder:
            answers = [self.postprocess_encoder_decoder_single(o) for o in model_outputs]
        else:
            answers = self.postprocess_extractive_qa(model_outputs, top_k=top_k, **kwargs)

        answers = sorted(answers, key=lambda x: x.get("score", 0), reverse=True)[:top_k]
        return answers

    def postprocess_encoder_decoder_single(self, model_outputs, **kwargs):
        sequence = self.tokenizer.batch_decode(model_outputs["sequences"])[0]

        # Strip Donut's special tokens, then the first task start token, from the decoded sequence
        sequence = sequence.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "")
        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()

        ret = {
            "answer": None,
        }

        answer = re.search(r"<s_answer>(.*)</s_answer>", sequence)
        if answer is not None:
            ret["answer"] = answer.group(1).strip()
        return ret

    def postprocess_extractive_qa(
        self, model_outputs, top_k=1, handle_impossible_answer=False, max_answer_len=15, **kwargs
    ):
        min_null_score = 1000000  # large and positive
        answers = []
        for output in model_outputs:
            words = output["words"]

            starts, ends, scores, min_null_score = select_starts_ends(
                start=output["start_logits"],
                end=output["end_logits"],
                p_mask=output["p_mask"],
                attention_mask=output["attention_mask"].numpy()
                if output.get("attention_mask", None) is not None
                else None,
                min_null_score=min_null_score,
                top_k=top_k,
                handle_impossible_answer=handle_impossible_answer,
                max_answer_len=max_answer_len,
            )
            word_ids = output["word_ids"]
            for start, end, score in zip(starts, ends, scores):
                word_start, word_end = word_ids[start], word_ids[end]
                if word_start is not None and word_end is not None:
                    answers.append(
                        {
                            "score": float(score),
                            "answer": " ".join(words[word_start : word_end + 1]),
                            "start": word_start,
                            "end": word_end,
                        }
                    )

        if handle_impossible_answer:
            answers.append({"score": min_null_score, "answer": "", "start": 0, "end": 0})

        return answers