
    g$                        d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ  ej        e          Z e ej                               Z! e"d e!D                       Z#e G d d                      Z$ G d de          Z% G d de          Z&dS )    N)	dataclassfield)Enum)DictListOptionalUnion)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)logging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc              #   $   K   | ]}|j         V  d S N)
model_type).0confs     \/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/data/datasets/squad.py	<genexpr>r   "   s$      EEDOEEEEEE    c                       e Zd ZU dZ edddd                    e          z   i          Zee	d<    edddi          Z
ee	d	<    ed
ddi          Zee	d<    ed
ddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    eddd i          Zee	d!<    ed"dd#i          Zee	d$<   dS )%SquadDataTrainingArgumentszb
    Arguments pertaining to what data we are going to input our model for training and eval.
    Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads)__name__
__module____qualname____doc__r   joinMODEL_TYPESr   str__annotations__r"   r$   intr%   r'   r)   r*   boolr+   r,   floatr.   r/   r1    r   r   r   r   %   s          e(KdiiXcNdNd(de  J    E(pq  Hc     %Q
  NC    ers  J    "E/
  c    #UJ
  s    "E)\ ]  OT    %*E)o p% % %T    (-uv'rs( ( (u    uf&qr  K    5C
  GS    5f6k-lmmmGSmmmmmr   r   c                       e Zd ZdZdZdS )SplittraindevN)r2   r3   r4   r@   rA   r=   r   r   r?   r?   h   s        E
CCCr   r?   c                       e Zd ZU dZeed<   ee         ed<   eed<   e	ed<   dej
        dddfded	ed
ee         deeef         dee	         dee         dee         fdZd Zdeeej        f         fdZdS )SquadDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                 8   || _         || _        |j        rt                      nt	                      | _        t          |t                    r,	 t          |         }n# t          $ r t          d          w xY w|| _
        |j        rdnd}t          j                            ||n|j        d|j         d|j        j         d|j         d|           }	|	dz   }
t'          |
          5  t          j                            |	          r|j        st-          j                    }t/          j        |	          | _        | j        d         | _        | j                            dd           | _        | j                            d	d           | _        t<                              d
|	 dt-          j                    |z
             | j        | j        t<                               d|	 d           n|t          j!        k    r%| j        "                    |j                  | _        n$| j        #                    |j                  | _        tI          | j        ||j        |j%        |j&        |t          j'        k    |j(        |          \  | _        | _        t-          j                    }t/          j)        | j        | j        | j        d|	           t<                              d|	 dt-          j                    |z
  dd           d d d            d S # 1 swxY w Y   d S )Nzmode is not a valid split namev2v1cached__z.lockrE   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rS   rI   r$   r%   r'   is_trainingr1   return_dataset)rE   rR   rS   z!Saving features into cached file z [took z.3fz s])*rD   rG   r+   r   r   	processor
isinstancer8   r?   KeyErrorrF   ospathr6   r"   value	__class__r2   r$   r
   existsr*   timetorchloadold_featuresrE   getrR   rS   loggerinfowarningrA   get_dev_examplesget_train_examplesr   r%   r'   r@   r1   save)selfrD   rI   rJ   rF   rG   rK   rL   version_tagcached_features_file	lock_pathstarts               r   __init__zSquadDataset.__init__w   sq    	%:"/3/Kc)+++QaQcQcdC   	AAT{ A A A?@@@A	":Ddd!w||".IIDMedjee9#6#?ee$BUeeXcee 
  
 )72	i   ,	 ,	w~~233 +D<P +	$)J/C$D$D! !% 1* =#044YEE $ 1 5 5j$ G G]9M]]]_c_h_j_jmr_r   <'4=+@NN&0D & & &  
 59$$$(N$C$CDM$R$RDMM$(N$E$Edm$T$TDM.P!]'#'#6#%)%: $ 3 L#1	/ 	/ 	/+t| 	
!%4<UYUbcc(  
 q8LqqUYU^U`U`chUhqqqq  U,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	s   A A5'HLLLc                 *    t          | j                  S r   )lenrE   )ri   s    r   __len__zSquadDataset.__len__   s    4=!!!r   returnc                 &   | j         |         }t          j        |j        t          j                  }t          j        |j        t          j                  }t          j        |j        t          j                  }t          j        |j        t          j                  }t          j        |j        t          j	                  }t          j        |j
        t          j	                  }|||d}	| j        j        dv r|	d= | j        j        dv r|	                    ||d           | j        j        r|	                    d|i           | j        rG|	                    dt          j        |j        t          j                  | j        j        z  i           | j        t*          j        k    rbt          j        |j        t          j                  }
t          j        |j        t          j                  }|	                    |
|d	           |	S )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertrw   )xlnetrx   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)rE   r_   tensorru   longrv   rw   r}   r~   r<   r   rD   r   updater+   rG   onesshapeint64r/   rF   r?   r@   start_positionend_position)ri   ifeatureru   rv   rw   r}   r~   r   inputsr   r   s               r   __getitem__zSquadDataset.__getitem__   s   -"L!2%*EEE	g&<EJOOOg&<EJOOOL!2%*EEE	gnEK@@@W%:%+NNN #,,
 
 9#PPP'(9#333MM	VDDEEEy0 @>???) owIO5;)W)W)WZ^ZcZk)kmnnn9###l7+ATTTO!L)=UZPPPMMMoP]^^___r   )r2   r3   r4   r5   r   r9   r   r   r?   r;   r@   r   r   r:   r	   r8   rn   rq   r   r_   Tensorr   r=   r   r   rC   rC   m   s           %$$$=!!!!
KKK '+"'+05#'(,I I(I 'I sm	I
 CJI  (~I C=I !I I I IV" " " S%,%6 7            r   rC   )'rY   r^   dataclassesr   r   enumr   typingr   r   r   r	   r_   filelockr
   torch.utils.datar   models.auto.modeling_autor   tokenization_utilsr   utilsr   processors.squadr   r   r   r   
get_loggerr2   rc   listkeysMODEL_CONFIG_CLASSEStupler7   r   r?   rC   r=   r   r   <module>r      s   
			  ( ( ( ( ( ( ( (       . . . . . . . . . . . .        $ $ $ $ $ $ M M M M M M 5 5 5 5 5 5       t t t t t t t t t t t t 
	H	%	%tE@EGGHH eEE0DEEEEE ?n ?n ?n ?n ?n ?n ?n ?nD    D   
x x x x x7 x x x x xr   