
    g6                        d dl Z d dlZd dlZd dlmZ d dlmZmZmZ ddlm	Z	m
Z
mZ  ej        e          Ze G d d                      Z ed	           G d
 d                      Z G d d          Z G d de          ZdS )    N)	dataclass)ListOptionalUnion   )is_tf_availableis_torch_availableloggingc                   b    e Zd ZU dZeed<   eed<   dZee         ed<   dZee         ed<   d Z	dS )InputExamplea5  
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    guidtext_aNtext_blabelc                 X    t          j        t          j        |           d          dz   S )*Serializes this instance to a JSON string.   )indent
jsondumpsdataclassesasdictselfs    ^/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/data/processors/utils.pyto_json_stringzInputExample.to_json_string1   s'    z+,T221===DD    )
__name__
__module____qualname____doc__str__annotations__r   r   r   r    r   r   r   r      sp           IIIKKK FHSM   E8C=E E E E Er   r   T)frozenc                       e Zd ZU dZee         ed<   dZeee                  ed<   dZ	eee                  ed<   dZ
eeeef                  ed<   d ZdS )InputFeaturesa  
    A single set of features of data. Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
            tokens.
        token_type_ids: (Optional) Segment token indices to indicate first and second
            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
    	input_idsNattention_masktoken_type_idsr   c                 T    t          j        t          j        |                     dz   S )r   r   r   r   s    r   r   zInputFeatures.to_json_stringK   s"    z+,T2233d::r   )r    r!   r"   r#   r   intr%   r+   r   r,   r   r   floatr   r&   r   r   r)   r)   6   s           Cy*.NHT#Y'...*.NHT#Y'...)-E8E#u*%&---; ; ; ; ;r   r)   c                   N    e Zd ZdZd Zd Zd Zd Zd Zd Z	e
d
d	            ZdS )DataProcessorzEBase class for data converters for sequence classification data sets.c                     t                      )z
        Gets an example from a dict with tensorflow tensors.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        NotImplementedError)r   tensor_dicts     r   get_example_from_tensor_dictz*DataProcessor.get_example_from_tensor_dictS   s     "###r   c                     t                      )z8Gets a collection of [`InputExample`] for the train set.r3   r   data_dirs     r   get_train_examplesz DataProcessor.get_train_examples]       !###r   c                     t                      )z6Gets a collection of [`InputExample`] for the dev set.r3   r8   s     r   get_dev_exampleszDataProcessor.get_dev_examplesa   r;   r   c                     t                      )z7Gets a collection of [`InputExample`] for the test set.r3   r8   s     r   get_test_exampleszDataProcessor.get_test_examplese   r;   r   c                     t                      )z*Gets the list of labels for this data set.r3   r   s    r   
get_labelszDataProcessor.get_labelsi   r;   r   c                     t          |                                           dk    r1|                                 t          |j                           |_        |S )z
        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
        examples to the correct format.
           )lenrA   r.   r   )r   examples     r   tfds_mapzDataProcessor.tfds_mapm   sF    
 t  !!A%% OO--c'-.@.@AGMr   Nc                     t          |dd          5 }t          t          j        |d|                    cddd           S # 1 swxY w Y   dS )z!Reads a tab separated value file.rz	utf-8-sig)encoding	)	delimiter	quotecharN)openlistcsvreader)cls
input_filerL   fs       r   	_read_tsvzDataProcessor._read_tsvv   s     *cK888 	LA
1	JJJKK	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	Ls   $AAAN)r    r!   r"   r#   r6   r:   r=   r?   rA   rF   classmethodrT   r&   r   r   r1   r1   P   s        OO$ $ $$ $ $$ $ $$ $ $$ $ $   L L L [L L Lr   r1   c                       e Zd ZdZddZd Zd Ze	 dd            Zedd            Z		 	 	 	 	 	 	 ddZ
	 ddZ	 	 	 	 	 ddZdS )%SingleSentenceClassificationProcessorz@Generic processor for a single sentence classification data set.NclassificationFc                 N    |g n|| _         |g n|| _        || _        || _        d S rU   )labelsexamplesmodeverbose)r   r[   r\   r]   r^   s        r   __init__z.SingleSentenceClassificationProcessor.__init__   s3    "Nbb&.H	r   c                 *    t          | j                  S rU   )rD   r\   r   s    r   __len__z-SingleSentenceClassificationProcessor.__len__   s    4=!!!r   c                     t          |t                    r!t          | j        | j        |                   S | j        |         S )N)r[   r\   )
isinstanceslicerX   r[   r\   )r   idxs     r   __getitem__z1SingleSentenceClassificationProcessor.__getitem__   sA    c5!! 	j8VZVcdgVhiiii}S!!r    r   rC   c           
      P     | di |}|                     ||||||dd           |S )NT)
split_namecolumn_labelcolumn_text	column_idskip_first_rowoverwrite_labelsoverwrite_examplesr&   )add_examples_from_csv)	rQ   	file_nameri   rj   rk   rl   rm   kwargs	processors	            r   create_from_csvz5SingleSentenceClassificationProcessor.create_from_csv   sR     CMM&MM	''!%#)!# 	( 		
 		
 		
 r   c                 D     | di |}|                     ||           |S )N)r[   r&   )add_examples)rQ   texts_or_text_and_labelsr[   rr   rs   s        r   create_from_examplesz:SingleSentenceClassificationProcessor.create_from_examples   s3    CMM&MM	7GGGr   c	                    |                      |          }	|r
|	dd          }	g }
g }g }t          |	          D ]\  }}|
                    ||                    |                    ||                    ||                    ||                    Y|r| d| nt          |          }|                    |           |                     |
||||          S )NrC   -)rn   ro   )rT   	enumerateappendr$   rv   )r   rq   ri   rj   rk   rl   rm   rn   ro   linestextsr[   idsiliner   s                   r   rp   z;SingleSentenceClassificationProcessor.add_examples_from_csv   s    y)) 	!""IE '' 	! 	!GAtLLk*+++MM$|,---$

4	?++++.8D***q***c!ff

4      631AVh ! 
 
 	
r   c           	         |Ot          |          t          |          k    r/t          dt          |           dt          |                     |Ot          |          t          |          k    r/t          dt          |           dt          |                     |d gt          |          z  }|d gt          |          z  }g }t                      }t          |||          D ]g\  }}	}
t	          |t
          t          f          r|	|\  }}	n|}|                    |	           |                    t          |
|d |	                     h|r|| _
        n| j
                            |           |rt          |          | _        n9t          t          | j                                      |                    | _        | j
        S )Nz(Text and labels have mismatched lengths z and z%Text and ids have mismatched lengths )r   r   r   r   )rD   
ValueErrorsetziprc   tuplerN   addr|   r   r\   extendr[   union)r   rw   r[   r   rn   ro   r\   added_labelstext_or_text_and_labelr   r   texts               r   rv   z2SingleSentenceClassificationProcessor.add_examples   s    #&>"?"?3v;;"N"Nl3?W;X;Xll_bci_j_jll   ?s#;<<CHHsSIaEbEbssilmpiqiqssttt;&37888C>Vc":;;;Fuu367OQWY\3]3] 	\ 	\/"E405$-@@ .U]4ee-U###OOLd4TYZZZ[[[[  	+$DMMM  ***  	E|,,DKKs4;//55lCCDDDK}r   Tc           	      	   ||j         }d t          | j                  D             }g }t          | j                  D ]p\  }	}
|	dz  dk    rt                              d|	            |                    |
j        dt          ||j                             }|	                    |           qt          d |D                       }g t          t          || j                            D ][\  }	\  }}
|	dz  dk    r2t                              d	|	 d
t          | j                              |rdndgt          |          z  }|t          |          z
  }|r|g|z  |z   }|rdndg|z  |z   }n||g|z  z   }||rdndg|z  z   }t          |          |k    r"t          dt          |           d|           t          |          |k    r"t          dt          |           d|           | j        dk    r||
j                 }n4| j        dk    rt!          |
j                  }nt          | j                  |	dk     r| j        rt                              d           t                              d|
j                    t                              dd                    d |D                                   t                              dd                    d |D                                   t                              d|
j         d| d           	                    t)          |||                     ]|S |dk    rt+                      st-          d          ddl}fd}|j        j                            ||j        |j        d|j        f|                    dg          |                    dg          d|                    g           f          }|S |d k    rt=                      st-          d!          ddl}dd"l m!} |"                    d# D             |j#        $          }|"                    d% D             |j#        $          }| j        dk    r'|"                    d& D             |j#        $          }n1| j        dk    r&|"                    d' D             |j        $          } ||||          }|S t          d(          ))a  
        Convert examples in a list of `InputFeatures`

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
                values)

        Returns:
            If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
            task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
            `InputFeatures` which can be fed to the model.

        Nc                     i | ]\  }}||	S r&   r&   ).0r   r   s      r   
<dictcomp>zFSingleSentenceClassificationProcessor.get_features.<locals>.<dictcomp>  s    EEE(!UUAEEEr   i'  r   zTokenizing example T)add_special_tokens
max_lengthc              3   4   K   | ]}t          |          V  d S rU   )rD   )r   r*   s     r   	<genexpr>zESingleSentenceClassificationProcessor.get_features.<locals>.<genexpr>  s(      IIi3y>>IIIIIIr   zWriting example /rC   zError with input length z vs rY   
regression   z*** Example ***zguid: zinput_ids:  c                 ,    g | ]}t          |          S r&   r$   r   xs     r   
<listcomp>zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>6  s    3N3N3NqCFF3N3N3Nr   zattention_mask: c                 ,    g | ]}t          |          S r&   r   r   s     r   r   zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>7  s    8X8X8XAQ8X8X8Xr   zlabel: z (id = )r*   r+   r   tfz?return_tensors set to 'tf' but TensorFlow 2.0 can't be importedc               3   F   K   D ]} | j         | j        d| j        fV  d S )Nr*   r+   r   )exfeaturess    r   genz?SingleSentenceClassificationProcessor.get_features.<locals>.genC  sI      " g gB)+IZ[[]_]efffffg gr   r   ptz8return_tensors set to 'pt' but PyTorch can't be imported)TensorDatasetc                     g | ]	}|j         
S r&   )r*   r   rS   s     r   r   zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>S  s    )H)H)H!!+)H)H)Hr   )dtypec                     g | ]	}|j         
S r&   )r+   r   s     r   r   zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>T  s    .R.R.RAq/?.R.R.Rr   c                     g | ]	}|j         
S r&   r   r   s     r   r   zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>V      *E*E*Eq17*E*E*Er   c                     g | ]	}|j         
S r&   r   r   s     r   r   zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>X  r   r   z,return_tensors should be one of 'tf' or 'pt')$max_lenr{   r[   r\   loggerinfoencoder   minr|   maxr   rD   r   r]   r   r/   r^   r   joinr)   r   RuntimeError
tensorflowdataDatasetfrom_generatorint32int64TensorShaper	   torchtorch.utils.datar   tensorlong)r   	tokenizerr   pad_on_left	pad_tokenmask_padding_with_zeroreturn_tensors	label_mapall_input_idsex_indexrE   r*   batch_lengthr+   padding_lengthr   r   r   datasetr   r   all_attention_mask
all_labelsr   s                          @r   get_featuresz2SingleSentenceClassificationProcessor.get_features   s   6 "*JEEi.D.DEEE	!*4=!9!9 		, 		,Hg%1$$<(<<===!((#'z9+<== )  I
   ++++II=IIIII.7M4=8Y8Y.Z.Z #	l #	l*H*y'%1$$NxNN#dm:L:LNNOOO $:@aaqAC	NNRN *C	NN:N j'[>9YF	(>#E11A"F"W[i!i%)~)EF	!/9O4VAAUV3WZh3h!i9~~-- !^C	NN!^!^P\!^!^___>""l22 !cC<O<O!c!cUa!c!cdddy,,,!'-0l**gm,, +++!|||-...3W\33444Q#((3N3NI3N3N3N*O*OQQRRR[sxx8X8X8X8X8X/Y/Y[[\\\DgmDDEDDDEEEOOMIndijjjkkkk!Ot##"$$ f"#deee####g g g g g go44!x28DDbhO!~~tf55Y]X^I_I_``bdbpbpqsbtbtu G
 Nt##%'' _"#]^^^LLL666666!LL)H)Hx)H)H)HPUPZL[[M!&.R.R.R.R.RZ_Zd!e!ey,,,"\\*E*EH*E*E*EUZ\XX

l**"\\*E*EH*E*E*EU[\YY
#mM3EzRRGNKLLLr   )NNrY   F)rg   r   rC   NFrU   )rg   r   rC   NFFF)NNFF)NFr   TN)r    r!   r"   r#   r_   ra   rf   rV   rt   rx   rp   rv   r   r&   r   r   rX   rX   }   s       JJ   " " "" " "
 ej   [     [  
 
 
 
> kp# # # #P #uM uM uM uM uM uMr   rX   )rO   r   r   r   typingr   r   r   utilsr   r	   r
   
get_loggerr    r   r   r)   r1   rX   r&   r   r   <module>r      s~  " 


      ! ! ! ! ! ! ( ( ( ( ( ( ( ( ( ( A A A A A A A A A A 
	H	%	% E E E E E E E E0 $; ; ; ; ; ; ; ;2*L *L *L *L *L *L *L *LZ`M `M `M `M `MM `M `M `M `M `Mr   