
    g%H                        d Z ddlZddlZddlmZ ddlmZ ddlZddlZddl	m
Z ddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZ  ej                      ej         e!          Z"g dZ#ej$        j%        ej$        j&        ej$        j'        ej$        j(        ej$        j)        ej$        j*        ej$        j+        ej$        j,        ej$        j-        ej$        j.        ej$        j/        ej$        j0        edZ1 e2d          Z3e3dgz   Z4e4g dz   Z5d Z6d Z7de8de8de9de8de8f
dZ:e!dk    r ej;                    Z<e<=                    de8dd           e<=                    ddd !           e<=                    d"de8dd#$           e<=                    d%e8d&'           e<=                    d(e8d)'           e<>                                Z? e:e?j@        e?jA        e?jB        e?jC        e?jD                   dS dS )*zConvert ESM checkpoint.    N)Path)TemporaryDirectory)batch_encode_sequences)
esmfold_v1)	EsmConfigEsmFoldConfig)EsmForMaskedLMEsmForSequenceClassificationEsmIntermediateEsmLayer	EsmOutputEsmSelfAttentionEsmSelfOutput)EsmForProteinFolding)EsmTokenizer)logging))protein1\  MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA)protein2?MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA)protein3zPMKTVRQERLKSI<mask>RILERSKEPVSGAQLAEELS<mask>SRQVIVQDIAYLRSLGYN<mask>VATPRGYVLAGG)protein4zNMKTVRQERLKSI<mask>RILERSKEPVSGAQLAEELS<mask>SRQVIVQDIAYLRSLGYN<mask>VATPRGYVLA)esm1b_t33_650M_UR50Sesm1v_t33_650M_UR90S_1esm1v_t33_650M_UR90S_2esm1v_t33_650M_UR90S_3esm1v_t33_650M_UR90S_4esm1v_t33_650M_UR90S_5esm2_t48_15B_UR50Desm2_t36_3B_UR50Desm2_t33_650M_UR50Desm2_t30_150M_UR50Desm2_t12_35M_UR50Desm2_t6_8M_UR50Dr   ARNDCQEGHILKMFPSTWYVX)z<pad>z<mask>z<cls>z<sep>z<eos>c                     t                      5 } d                    t                    }t          |           dz  }|                    |           t          t          |                    }d d d            n# 1 swxY w Y   d|_        |S )N
	vocab.txt
vocab_filer   )r   joinrestypes_with_extrasr   
write_textr   strpad_token_id)tempdirvocabr+   hf_tokenizers       _/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/esm/convert_esm.pyget_esmfold_tokenizerr5   M   s    			 @		.//']][0
e$$$#s:???	@ @ @ @ @ @ @ @ @ @ @ @ @ @ @
 !"Ls   AA::A>A>c                     |                     |                                           }|j        rt          d|j                   |j        rt          d|j                   d S )NzMissing keys: zUnexpected keys: )load_state_dict
state_dictmissing_keys
ValueErrorunexpected_keys)original_module
our_modulestatuss      r4   transfer_and_check_weightsr?   W   sy    ''(B(B(D(DEEF A?&*=??@@@ GEV-CEEFFFG G    modelpytorch_dump_folder_pathclassification_headpush_to_repo
auth_tokenc           	         |                      d          rt          |                      }nt          |                      \  }}|                                 |                      d          rC|j        j        }|j        j        }|j        j        }	d|z  }
|j        j        }d}d}d}t                      }|j	        
                                D ],\  }}t          ||          r|dk    rt          |||           -|j	        j        
                                D ]6\  }}t          |j        |          r|dk    rt          |j        ||           7|j	        j        j        
                                D ]:\  }}t          |j        j        |          rt          |j        j        ||           ;nt          |d          rN|j        j        }|j        j        }|j        j        }	|j        j        }
|j        j        }|j        rdnd}d	}d}d
}n)|j        }|j        }|j        }	d|z  }
|j        }d}d}d}d
}|r|j        j        }t)          |j                  }|j        }|j        }|r|j        }n|}t1          dCi d|j        j        d|d|d|d|	d|
ddddddddd|d|d|d|d|d|d|}|r'|j        d         j        j        j        d          |_        tA          d!|           |                      d          rtB          }n|rtD          }ntF          } ||          } |                                  |j        j        | j        j$        j%        _        |d	k    r |j&        j        | j        j$        j'        _        |j        r@|j        j        | j        j$        j(        _        |j        j)        | j        j$        j(        _)        |j*        j        | j        j+        j*        _        |j*        j)        | j        j+        j*        _)        tY          |j-                  D ]+}| j        j+        j.        |         }|j        |         }|j/        j0        }|j1        j2        j        j3        j        |j1        j4        j        j3        j        cxk    rD|j1        j5        j        j3        j        cxk    r$tm          j7        |j8        |j8        f          k    sn J |j1        j4        j        |j9        j        _3        |j1        j4        j)        |j9        j)        _3        |j1        j2        j        |j:        j        _3        |j1        j2        j)        |j:        j)        _3        |j1        j5        j        |j;        j        _3        |j1        j5        j)        |j;        j)        _3        ty          |j1        d"d
           |j1        j=        j>        |j?        j>        _3        |j@        j        |j/        jA        _        |j@        j)        |j/        jA        _)        |jB        j        |jA        _        |jB        j)        |jA        _)        |j/        jC        }|jD        j        j        |j1        j        j        j        k    sJ |j1        j        j        |jD        _        |j1        j        j)        |jD        _)        |jE        }|jD        j        j        |jF        j        j        k    sJ |jF        j        |jD        _        |jF        j)        |jD        _)        |jC        }|jD        j        j        |jG        j        j        k    sJ |jG        j        |jD        _        |jG        j)        |jD        _)        -|r|jH        j3        | jH        _3        |jI        j3        | jI        _3        t          |jK        | jK                   t          |jL        | jL                   t          |j        | j                   t          |jM        | jM                   t          |jN        | jN                   t          |jO        | jO                   t          |jP        | jP                   nQ|r|j        j        d         jD        j        | jQ        jD        _        |j        d         jD        j)        | jQ        jD        _)        |j        d         j        j        | jQ        j        _        |j        d         j        j)        | jQ        j        _)        n|jO        jD        j        | jO        jD        _        |jO        jD        j)        | jO        jD        _)        |jO        j(        j        | jO        j(        _        |jO        j(        j)        | jO        j(        _)        |jO        j        | jO        jR        _        |jO        j)        | jO        _)        t          |jS        | j        jS                   |rt          d
d#         }nt          }|rt                      }  | d$ |D             d%dd&          }!t          d' |D                       \  }"}#}$}$}$tm          jW        |!d(         |"k              otm          jW        |!d)         |#k              }%n|X                                }& |&|          \  }'}(})t                      5 }*d*Z                    |j                  }+t          |*          d+z  },|,\                    |+           t          t          |,          ,          } d
d
d
           n# 1 swxY w Y    | d- |D             d%d.          }!tm          jW        |!d(         |)k              }%tA          d/|%rd0nd1           |%st          d2          tm          j`                    5  |r|a                                b                    d3 |D                       }- | a                                |!d(         a                                |!d)         a                                4          }.nz | dCi |!d5di}.|.d6         }.|r/ |jc        j        d         |d                    |)                    }-n5 ||!d(         t          tY          d7                    8          }-|-d6         }-|rktm          jf        tm          jg        |.d9         |-d9         z
                      h                                }/tm          ji        |.d9         |-d9         d:          }%nRtm          jf        tm          jg        |.|-z
                      h                                }/tm          ji        |.|-d:          }%tA          d;|/            tA          d<|%rd0nd1           |%st          d=          |s| j                    |!d(         |!d)                   }.|j                    |!d(                   }-tm          jf        tm          jg        |.|-z
                      h                                }/tm          ji        |.|-d:          }%tA          d>           tA          d;|/            tA          d<|%rd0nd1           |%st          d=          t          j[        |          l                    dd?           tA          d@|            | m                    |           ~d
d
d
           n# 1 swxY w Y   tA          dA|            | m                    |           |r0| n                    ||B           | n                    ||B           d
S d
S )Dz?
    Copy/paste/tweak esm's weights to our BERT structure.
    esmfold   FrotaryTtrunkstructure_moduleargsabsoluteN
vocab_sizemask_token_idhidden_sizenum_hidden_layersnum_attention_headsintermediate_sizemax_position_embeddingsi  layer_norm_epsgh㈵>attention_probs_dropout_probg        hidden_dropout_probr0   emb_layer_norm_beforetoken_dropoutposition_embedding_typeis_folding_modelesmfold_config
vocab_listmnlir   zOur ESM config:rot_emb   c                     g | ]
}|d          S     .0rows     r4   
<listcomp>z5convert_esm_checkpoint_to_pytorch.<locals>.<listcomp>+  s    +++SV+++r@   pt)return_tensorspaddingadd_special_tokensc                     g | ]
}|d          S rb   rd   re   s     r4   rh   z5convert_esm_checkpoint_to_pytorch.<locals>.<listcomp>-  s    FeFeFeRUs1vFeFeFer@   	input_idsattention_maskr(   r)   r*   c                     g | ]
}|d          S rb   rd   re   s     r4   rh   z5convert_esm_checkpoint_to_pytorch.<locals>.<listcomp><  s    !@!@!@S#a&!@!@!@r@   )rj   rk   z1Do both models tokenizers output the same tokens?u   🔥u   💩zTokenization does not match!c                     g | ]
}|d          S rb   rd   re   s     r4   rh   z5convert_esm_checkpoint_to_pytorch.<locals>.<listcomp>J  s    ,K,K,KSV,K,K,Kr@   )rn   ro   output_hidden_stateslogitsi  )repr_layers	positions)atolzmax_absolute_diff = z'Do both models output the same tensors?zSomething went wRoNgzContact prediction testing:)parentsexist_okzSaving model to zSaving tokenizer to )repo_idtoken_tokenrd   )o
startswithMODEL_MAPPINGevalesm	embed_dim
num_layersattention_headsrY   r   cfgitemshasattrsetattrrJ   rK   rL   layersffn_embed_dimrX   alphabettupleall_toksmask_idxpadding_idxr   embed_tokensnum_embeddingsclassification_headsout_projweightshape
num_labelsprintr   r
   r	   
embeddingsword_embeddingsembed_positionsposition_embeddings
layer_normbiasemb_layer_norm_afterencoderrangerQ   layer	attentionself	self_attnk_projdataq_projv_projtorchSizerP   querykeyvaluegetattrr_   inv_freqrotary_embeddingsself_attn_layer_norm	LayerNormfinal_layer_normoutputdenseintermediatefc1fc2esm_s_combine
af2_to_esmr?   	embedding	esm_s_mlpdistogram_headptm_headlm_head	lddt_head
classifierdecodercontact_headSAMPLE_DATAr5   esmfold_encode_sequencesallget_batch_converterr   r,   r   r.   r   r/   	Exceptionno_gradcudainferrA   extract_featureslistmaxabsitemallclosepredict_contactspathlibmkdirsave_pretrainedpush_to_hub)0rA   rB   rC   rD   rE   r~   r   r   r   rR   rS   rY   rX   rZ   r[   r\   r   valr]   rO   r0   original_esm_modelconfigmodel_classir   	esm_layerr   self_outputr   bert_outputsample_datar3   	hf_tokensesmfold_aasesmfold_mask_successbatch_converterbatch_labels
batch_strsbatch_tokensr1   r2   r+   their_output
our_outputmax_absolute_diffs0                                                   r4   !convert_esm_checkpoint_to_pytorchr   _   s    	"" /E"$$%e,..XHHJJJ	"" (G%	W'
!g5	M- %"*& 	2 	2HC~s++ 2wS111++-- 	8 	8HC~+S11 8c=O6O6O,c37776<<>> 	I 	IHC~+<cBB I,=sCHHH	I 
f		 H&	X_
!h6H2.(+(A Lu",  M	^
!1	M) %"*  $7#x())J%M'L ! W    %2AA#m I %*	
 0/ ,+ !% t &)S  C "\ 43 $m !8 7 *)  &~!" :#F&  V4V<ELRSTU	
V$$$	"" %*	 %2$KE	JJLLL 3E2Q2XEI(/*,,:L:\:c	07# ]1C1Y1`	'./A/W/\	',4F4[4bEI*12D2Y2^EI*/6+,, 54 54)+1!4&-a0	 ',o&:	&-28")05;D D D D")05;D D D D z6-v/ABCCD D D D D D '0&9&@&G	#$-$7$>$C	!$-$7$>$E	!"+"5"<"A	&/&9&@&G	#$-$7$>$C	!9&	488D 9B8K8S8\I'05 ,5+I+P!()2)G)L!&!*!;!B(9> &+_%; '-1D1M1T1ZZZZZ#,#6#?#F !*!4!=!B ).(:!(.)-2F2LLLLL$-M$8!"+-"4 "' '-1E1KKKKK#,=#7 !*!3  .#&#4#9  # 3"3=%/BBB"3=%/BBB"39ek:::"3#5u7KLLL"3<@@@"3;>>>"3=%/BBBB	 .(+(DV(L(R(Y%&)&>v&F&L&Q#+.+CF+K+T+[!(),)A&)I)R)W!&& &)[%6%="#&;#4#9 *-+*@*G '(+(>(C %'*{'9$ [- s/1GHHH  "!"1"o! D,.. L++{+++D$ch
 
 
	 .FFeFeYdFeFeFe-f-f*\1a)Ik2kABB 
uy&'<7H
 H

 #66881@1M1M.j,!! 	DWIIh/00Eg4J!!%((('3z??CCCL		D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D !L!@!@K!@!@!@QU_cddd	)Ik2lBCC	
=?\vvV\]]] 86777	 3 3 	6 88::++,K,K{,K,K,KLLL%#K05577	RbHcHhHhHjHj  JJ FFFFFFFJ#H-J" 6Esy=fEcFZFZ[gFhFhii"s9[#9tERUJJGWGWXXX+H5 	J %	%)J{4Kl[fNg4g*h*h i i n n p pnZ%<l;>W^bcccGG %	%)J4M*N*N O O T T V VnZDIIIG8%688999779VPVWWW 	42333 	8//	+0F	RbHcddJ//	+0FGGL %	%)J4M*N*N O O T T V VnZDIIIG/000<):<<===;w=ZVVTZ[[[ 8 6777-..44TD4QQQ;!9;;<<<6777g3 3 3 3 3 3 3 3 3 3 3 3 3 3 3j 

;!9
;
;<<<  !9::: O,JGGG  : NNNNNO Os&   2Akk!$k!Ly>>zz__main__z--pytorch_dump_folder_pathTz!Path to the output PyTorch model.)typerequiredhelpz--classification_head
store_truez/Whether to convert a final classification head.)actionr   z--modelzName of model to convert.)defaultr   r   r   z--push_to_repoz(Repo to upload to (including username!).)r   r   z--auth_tokenzHuggingFace auth token.)E__doc__argparser   r   tempfiler   r~   
esm_moduler   esm.esmfold.v1.miscr   r   esm.esmfold.v1.pretrainedr   )transformers.models.esm.configuration_esmr   r   $transformers.models.esm.modeling_esmr	   r
   r   r   r   r   r   (transformers.models.esm.modeling_esmfoldr   (transformers.models.esm.tokenization_esmr   transformers.utilsr   set_verbosity_info
get_logger__name__loggerr   
pretrainedr   r   r   r   r   r   r   r    r!   r"   r#   r$   r|   r   restypesrestypes_with_xr-   r5   r?   r/   boolr   ArgumentParserparseradd_argument
parse_argsrL   rA   rB   rC   rD   rE   rd   r@   r4   <module>r     s             ' ' ' ' ' '      R R R R R R 0 0 0 0 0 0 N N N N N N N N                  J I I I I I A A A A A A & & & & & &     		H	%	%   '1F(3J(3J(3J(3J(3J$/B#.@%0D%0D$/B"->   4&''cU"&)W)W)WW   G G G^O^O*-^ODH^OX[^Oil^O ^O ^O ^OB	 z$X$&&F
$3Dg     ;l     	4cDOjkkk
(s9cddd
S7PQQQD%%
D143KTM^`d`o     r@   