
    g\                     2   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	 d dl
Z
d dlmZ d dlmZ ddlmZ ddlmZ  ej        e          ZdZ G d	 d
e          Z G d de          Z G d de          Z G d de          Z G d de          ZdS )    N)DictListOptional)FileLock)Dataset   )PreTrainedTokenizer)loggingu   This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: {0}c            
       X    e Zd ZdZ	 	 ddedededee         fdZd	 Z	d
e
j        fdZdS )TextDatasetH
    This will be superseded by a framework-agnostic approach soon.
    FN	tokenizer	file_path
block_size	cache_dirc           
      b   t          j        t                              d          t                     t
          j                            |          du rt          d| d          ||	                    d          z
  }t
          j        
                    |          \  }}t
          j                            ||n|d|j        j         d| d|           }|dz   }	t          |	          5  t
          j                            |          r|st!          j                    }
t#          |d	          5 }t%          j        |          | _        d d d            n# 1 swxY w Y   t*                              d
| dt!          j                    |
z
             nt*                              d|            g | _        t#          |d          5 }|                                }d d d            n# 1 swxY w Y   |                    |                    |                    }t5          dt7          |          |z
  dz   |          D ]:}| j                            |                    ||||z                                 ;t!          j                    }
t#          |d          5 }t%          j        | j        |t$          j                   d d d            n# 1 swxY w Y   t*                              d| dt!          j                    |
z
  dd           d d d            d S # 1 swxY w Y   d S )Nchttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.pyFInput file path 
 not foundpair
cached_lm__.lockrb"Loading features from cached file  [took %.3f s]'Creating features from dataset file at utf-8encodingr      wbprotocol!Saving features into cached file  [took .3f s]) warningswarnDEPRECATION_WARNINGformatFutureWarningospathisfile
ValueErrornum_special_tokens_to_addsplitjoin	__class____name__r   existstimeopenpickleloadexamplesloggerinforeadconvert_tokens_to_idstokenizerangelenappend build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr   r   r   overwrite_cacher   	directoryfilenamecached_features_file	lock_pathstarthandleftexttokenized_textis                   h/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/data/datasets/language_modeling.py__init__zTextDataset.__init__-   s:    	&&u  		
 	
 	
 7>>)$$--E	EEEFFF)"E"E5"E"Q"QQ
 gmmI66	8!w||".IIIO,5OO
OOXOO 
  
 )72	i   	 	w~~233 O 	.55 8$*K$7$7DM8 8 8 8 8 8 8 8 8 8 8 8 8 8 8]9M]]]_c_h_j_jmr_r   
 QiQQRRR ")g666 $!6688D$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ "+!@!@ASASTXAYAY!Z!Zq#n"5"5
"BQ"F
SS  AM((!BB>RSVWZdVdRdCeff    	.55 YKv@WXXXXY Y Y Y Y Y Y Y Y Y Y Y Y Y Yq8LqqUYU^U`U`chUhqqqq  ;	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   -AL$2EL$E	L$E	 A-L$G."L$.G2	2L$5G2	6B-L$#'K
L$K	L$K	9L$$L(+L(c                 *    t          | j                  S NrD   r=   rI   s    rU   __len__zTextDataset.__len__j       4=!!!    returnc                 X    t          j        | j        |         t           j                  S )Ndtype)torchtensorr=   longrI   rT   s     rU   __getitem__zTextDataset.__getitem__m   s     |DM!,EJ????r]   )FN)r7   
__module____qualname____doc__r	   strintr   rV   r[   rb   Tensorrf    r]   rU   r   r   (   s          #'; ;&; ; 	; C=; ; ; ;z" " "@ @ @ @ @ @ @r]   r   c                   R    e Zd ZdZdededefdZd Zde	ee
j        f         fdZd	S )
LineByLineTextDatasetr   r   r   r   c                 "   t          j        t                              d          t                     t
          j                            |          du rt          d| d          t          
                    d|            t          |d          5 }d |                                                                D             }d d d            n# 1 swxY w Y    ||d	d	|
          }|d         | _        d | j        D             | _        d S )Nr   Fr   r   r   r   r    c                 `    g | ]+}t          |          d k    |                                )|,S r   )rD   isspace.0lines     rU   
<listcomp>z2LineByLineTextDataset.__init__.<locals>.<listcomp>   s4    fffdD		AVZVbVbVdVdTr]   Tadd_special_tokens
truncation
max_length	input_idsc                 R    g | ]$}d t          j        |t           j                  i%S r|   r`   rb   rc   rd   ru   es     rU   rw   z2LineByLineTextDataset.__init__.<locals>.<listcomp>   .    aaaa+u|AUZ'H'H'HIaaar]   )r*   r+   r,   r-   r.   r/   r0   r1   r2   r>   r?   r:   r@   
splitlinesr=   )rI   r   r   r   rQ   linesbatch_encodings          rU   rV   zLineByLineTextDataset.__init__v   sc   &&u  		
 	
 	
 7>>)$$--E	EEEFFF 	IiIIJJJ)g... 	g!ffaffhh&9&9&;&;fffE	g 	g 	g 	g 	g 	g 	g 	g 	g 	g 	g 	g 	g 	g 	g #5Td_ijjj&{3aaSWS`aaas   1CCCc                 *    t          | j                  S rX   rY   rZ   s    rU   r[   zLineByLineTextDataset.__len__   r\   r]   r^   c                     | j         |         S rX   r=   re   s     rU   rf   z!LineByLineTextDataset.__getitem__       }Qr]   Nr7   rg   rh   ri   r	   rj   rk   rV   r[   r   rb   rc   rf   rm   r]   rU   ro   ro   q   s         b"5 b# bSV b b b b*" " " S%,%6 7            r]   ro   c                   V    e Zd ZdZdedededefdZd Zde	ee
j        f         fd	Zd
S )LineByLineWithRefDatasetr   r   r   r   ref_pathc                    t          j        t                              d          t                     t
          j                            |          du rt          d| d          t
          j                            |          du rt          d| d          t          
                    d|            t          
                    d|            t          |d	          5 }|                                }d d d            n# 1 swxY w Y   d
 |D             }t          |d	          5 }d |                                                                D             }d d d            n# 1 swxY w Y   t          |          t          |          k    r5t          d| dt          |           d| dt          |                      ||dd|          }|d         | _        d | j        D             | _        t          | j                  }	t#          |	          D ]6}
t%          j        ||
         t$          j                  | j        |
         d<   7d S )Nzghttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.pyFr   r   zRef file path r   zUse ref segment results at r   r    c                     g | ]=}t          |          d k    |                                )|                                >S rr   )rD   rs   striprt   s     rU   rw   z5LineByLineWithRefDataset.__init__.<locals>.<listcomp>   s7    VVVTQt||~~

r]   c                     g | ]=}t          |          d k    |                                )t          j        |          >S rr   )rD   rs   jsonloadsrt   s     rU   rw   z5LineByLineWithRefDataset.__init__.<locals>.<listcomp>   s?    ppp#d))VW--`d`l`l`n`n-4:d##---r]   zDLength of Input file should be equal to Ref file. But the length of z is z while length of Trx   r|   c                 R    g | ]$}d t          j        |t           j                  i%S r~   r   r   s     rU   rw   z5LineByLineWithRefDataset.__init__.<locals>.<listcomp>   r   r]   r`   chinese_ref)r*   r+   r,   r-   r.   r/   r0   r1   r2   r>   r?   r:   	readlinesr@   r   rD   r=   rC   rb   rc   rd   )rI   r   r   r   r   rQ   datarefr   nrT   s              rU   rV   z!LineByLineWithRefDataset.__init__   s   &&y  		
 	
 	
 7>>)$$--E	EEEFFF7>>(##u,,CiCCCDDD 	IiIIJJJ<(<<===)g... 	!!;;==D	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!VVVVV(W--- 	qpp0C0C0E0EpppC	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	qt99C  <W` < <fijnfofo < <#+< <14S< <  
 #4DT^hiii&{3aaSWS`aaaq 	U 	UA.3l3q6.T.T.TDM!]++	U 	Us$   &DDD/1E,,E03E0c                 *    t          | j                  S rX   rY   rZ   s    rU   r[   z LineByLineWithRefDataset.__len__   r\   r]   r^   c                     | j         |         S rX   r   re   s     rU   rf   z$LineByLineWithRefDataset.__getitem__   r   r]   Nr   rm   r]   rU   r   r      s         "U"5 "U# "USV "Ube "U "U "U "UH" " " S%,%6 7            r]   r   c                   Z    e Zd ZdZdededefdZddZd Z	d	e
eej        f         fd
ZdS )LineByLineWithSOPTextDatasetzY
    Dataset for sentence order prediction task, prepare sentence pairs for SOP task
    r   file_dirr   c                    t          j        t                              d          t                     t
          j                            |          du rt          | d          t          
                    d|            g | _        t          j        |          D ]}t
          j                            ||          }t
          j                            |          du rt          | d          d}t          |d          5 }|                                }g }	|D ]p}
d|
v rd	}	d
|
v rLd}fd|	dd          D             }|                     ||          }| j                            |           g }	Y|r|	                    |
           q	 d d d            n# 1 swxY w Y   t          
                    d           d S )Nr   Fz is not a directoryz.Creating features from dataset file folder at z is not a filer   r    z<doc id=Tz</doc>c                     g | ]Q}t          |          d k    |                                )                                        |                    RS rr   )rD   rs   rA   rB   )ru   rv   r   s     rU   rw   z9LineByLineWithSOPTextDataset.__init__.<locals>.<listcomp>   sX     $ $ $ $ #D		Adllnn &;;I<N<Nt<T<TUU -r]   r"   zDataset parse finished.)r*   r+   r,   r-   r.   r/   r0   isdirr2   r>   r?   r=   listdirr5   r1   r:   r   create_examples_from_documentextendrE   )rI   r   r   r   	file_namer   article_openrQ   original_linesarticle_linesrv   documentr=   s    `           rU   rV   z%LineByLineWithSOPTextDataset.__init__   s:   &&u  		
 	
 	
 7==""e++===>>>OXOOPPP H-- 	7 	7IXy99Iw~~i((E11 I!=!=!=>>> Li'222 7a!" "* 7 7D!T))'+!T))',$ $ $ $(5abb(9$ $ $ $(#E#EhPZ\e#f#f,,X666(*' 7)00666!77 7 7 7 7 7 7 7 7 7 7 7 7 7 7* 	-.....s   B
FF#	&F#	皙?c                    ||                     d          z
  }|}t          j                    |k     rt          j        d|          }g }g }d}	d}
|
t          |          k     r}||
         }|s|
dz  }
$|                    |           |	t          |          z  }	|
t          |          dz
  k    s|	|k    r|r	d}t          |          dk    r%t          j        dt          |          dz
            }g }t          |          D ]}|                    ||                    g }t          |t          |                    D ]}|                    ||                    t          |          dk    st          |          dk    r9t          j                    dk     rd}||}}nd}d } ||||           t          |          dk    s t          d	t          |           d
          t          |          dk    s t          dt          |           d
          |                    ||          }|	                    ||          }t          j        |t          j                  t          j        |t          j                  t          j        |rdndt          j                  d}|                    |           g }d}	|
dz  }
|
t          |          k     }|S )'Creates examples for a single document.Tr      r   r"         ?Fc                 >   	 t          |           t          |          z   }||k    rdS t          |           t          |          k    r| n|}t          |          dk    st          d          t          j                    dk     r|d= n|                                 )z;Truncates a pair of sequences to a maximum sequence length.Tr"   z8Sequence length to be truncated must be no less than oner   r   N)rD   r2   randompop)tokens_atokens_bmax_num_tokenstotal_lengthtrunc_tokenss        rU   truncate_seq_pairzULineByLineWithSOPTextDataset.create_examples_from_document.<locals>.truncate_seq_pair-  s    3+.x==3x==+HL+~== %7:8}}s8}}7T7T88ZbL$'$5$5$:$:&01k&l&l l  &}44$0OO , 0 0 2 2 23r]   Length of sequence a is  which must be no less than 1Length of sequence b is r`   )r|   token_type_idssentence_order_label)r3   r   randintrD   rE   rC   r   r2   rF   $create_token_type_ids_from_sequencesrb   rc   rd   )rI   r   r   r   short_seq_probr   target_seq_lengthr=   current_chunkcurrent_lengthrT   segmenta_endr   jr   is_nextr   r|   r   examples                        rU   r   z:LineByLineWithSOPTextDataset.create_examples_from_document   s"    $i&I&It&I&T&TT +=??^++ &q. A A #h--qkG Q  )))c'll*NCMMA%%%;L)L)L  :-E=))Q.. &q#m2D2Dq2H I I!H"5\\ : : a(89999  "H"5#m*<*<== : : a(899998}}))S]]a-?-?  },,"'-5x("&3 3 3  &%h.IIIMMQ..()pCMM)p)p)pqqqMMQ..()pCMM)p)p)pqqq !* J J8U] ^ ^I%.%S%ST\^f%g%gN &+\)5:%N%N%N*/,~UZ*X*X*X05'=PQQqX]Xb0c0c0c G
 OOG,,, "!"FAM #h--N r]   c                 *    t          | j                  S rX   rY   rZ   s    rU   r[   z$LineByLineWithSOPTextDataset.__len__S  r\   r]   r^   c                     | j         |         S rX   r   re   s     rU   rf   z(LineByLineWithSOPTextDataset.__getitem__V  r   r]   N)r   )r7   rg   rh   ri   r	   rj   rk   rV   r   r[   r   rb   rc   rf   rm   r]   rU   r   r      s         '/"5 '/ '/RU '/ '/ '/ '/Ra a a aF" " " S%,%6 7            r]   r   c                   f    e Zd ZdZ	 	 	 ddededefdZd	eee                  d
edefdZ	d Z
d ZdS )$TextDatasetForNextSentencePredictionr   Fr   r   r   r   r   c           	      R   t          j        t                              d          t                     t
          j                            |          st          d| d          || _	        || _
        t
          j                            |          \  }}t
          j                            |d|j        j         d| d|           }	|| _        |	dz   }
t!          |
          5  t
          j                            |	          r|st%          j                    }t'          |	d          5 }t)          j        |          | _        d d d            n# 1 swxY w Y   t.                              d|	 d	t%          j                    |z
             nt.                              d
|            g g| _        t'          |d          5 }	 |                                }|sn|                                }|s8t9          | j        d                   dk    r| j                            g            |                    |          }|                    |          }|r | j        d                             |           	 d d d            n# 1 swxY w Y   t.                              dt9          | j                   d           g | _        tA          | j                  D ]\  }}| !                    |||           t%          j                    }t'          |	d          5 }t)          j"        | j        |t(          j#                   d d d            n# 1 swxY w Y   t.                              d|	 dt%          j                    |z
  dd           d d d            d S # 1 swxY w Y   d S )Nr   r   r   cached_nsp_r   r   r   r   r   r   r   r    Tr   zCreating examples from z documents.r#   r$   r&   r'   r(   r)   )$r*   r+   r,   r-   r.   r/   r0   r1   r2   short_seq_probabilitynsp_probabilityr4   r5   r6   r7   r   r   r8   r9   r:   r;   r<   r=   r>   r?   	documentsreadliner   rD   rE   rB   rA   	enumerater   rG   rH   )rI   r   r   r   rJ   r   r   rK   rL   rM   rN   rO   rP   rQ   rv   tokens	doc_indexr   s                     rU   rV   z-TextDatasetForNextSentencePrediction.__init___  s    	&&u  		
 	
 	
 w~~i(( 	GE	EEEFFF%:". gmmI66	8!w||P)-6PPPPhPP 
  

 # )72	 i   %	 %	w~~233 $O $	.55 8$*K$7$7DM8 8 8 8 8 8 8 8 8 8 8 8 8 8 8]9M]]]_c_h_j_jmr_r    QiQQRRR"$)g666 >!> zz||# "!#zz||  $ 6DN2,>(?(?1(D(D N11"555!*!3!3D!9!9!*!@!@!H!H! > N2.55f===> "	> > > > > > > > > > > > > > > Vc$.6I6IVVVWWW "+4T^+D+D X X'Ix66xJWWWW	.55 YKv@WXXXXY Y Y Y Y Y Y Y Y Y Y Y Y Y Yq8LqqUYU^U`U`chUhqqqq  G%	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	s   #AN(ENE	NE	A.NB4J9NJ			NJ		BN'MNM	NM	9NN #N r   r   c                 ~   || j                             d          z
  }|}t          j                    | j        k     rt          j        d|          }g }d}d}|t          |          k     rQ||         }	|                    |	           |t          |	          z  }|t          |          dz
  k    s||k    r|rd}
t          |          dk    r%t          j        dt          |          dz
            }
g }t          |
          D ]}|                    ||                    g }t          |          dk    st          j                    | j	        k     rd}|t          |          z
  }t          d          D ]4}t          j        dt          | j
                  dz
            }||k    r n5| j
        |         }t          j        dt          |          dz
            }t          |t          |                    D ]2}|                    ||                    t          |          |k    r n3t          |          |
z
  }||z  }n=d}t          |
t          |                    D ]}|                    ||                    t          |          dk    s t          dt          |           d	          t          |          dk    s t          d
t          |           d	          | j                             ||          }| j                             ||          }t          j        |t          j                  t          j        |t          j                  t          j        |rdndt          j                  d}| j                            |           g }d}|dz  }|t          |          k     OdS dS )r   Tr   r   r   r"   
   Fr   r   r   r`   )r|   r   next_sentence_labelN)r   r3   r   r   r   rD   rE   rC   r   r   r   r2   rF   r   rb   rc   rd   r=   )rI   r   r   r   r   r   r   r   rT   r   r   r   r   r   is_random_nexttarget_b_lengthr   random_document_indexrandom_documentrandom_startnum_unused_segmentsr|   r   r   s                           rU   r   zBTextDatasetForNextSentencePrediction.create_examples_from_document  s    $dn&N&NTX&N&Y&YY +=??T777 &q. A A#h--qkG  )))c'll*NCMMA%%%;L)L)L  :2 E=))Q.. &q#m2D2Dq2H I I!H"5\\ : : a(89999!H=))Q..&-//DDX2X2X)-*;c(mm*K "'r & &A4:N1c$.FYFY\]F]4^4^14	AA %  B +/.9N*O'-~a_9M9MPQ9Q'R'R!&|S5I5I!J!J & &A$OOOA,>???"8}}?? %  @ /2-.@.@5.H+00 */!&uc-.@.@!A!A > >A$OOM!,<====MMQ..()pCMM)p)p)pqqqMMQ..()pCMM)p)p)pqqq !% O OPXZb c cI%)^%X%XYack%l%lN &+\)5:%N%N%N*/,~UZ*X*X*X/4|<VAAUV^c^h/i/i/i G M((111 "!"FAI #h--r]   c                 *    t          | j                  S rX   rY   rZ   s    rU   r[   z,TextDatasetForNextSentencePrediction.__len__  r\   r]   c                     | j         |         S rX   r   re   s     rU   rf   z0TextDatasetForNextSentencePrediction.__getitem__  r   r]   N)Fr   r   )r7   rg   rh   ri   r	   rj   rk   rV   r   r   r[   rf   rm   r]   rU   r   r   Z  s          !S S&S S 	S S S SjXd49o XRU Xcf X X X Xt" " "         r]   r   )r   r/   r;   r   r9   r*   typingr   r   r   rb   filelockr   torch.utils.datar   tokenization_utilsr	   utilsr
   
get_loggerr7   r>   r,   r   ro   r   r   r   rm   r]   rU   <module>r      s    				     ' ' ' ' ' ' ' ' ' '        $ $ $ $ $ $ 5 5 5 5 5 5       
	H	%	%L F@ F@ F@ F@ F@' F@ F@ F@R         G      B-  -  -  -  - w -  -  - `U  U  U  U  U 7 U  U  U px  x  x  x  x 7 x  x  x  x  x r]   