
    g!9                        d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlmZmZ ej                            e          Zej                            ed          Z ee          5 Ze                                Zddd           n# 1 swxY w Y    G d d          Ze	e         Ze	ee                  Z G d d	e
          Z G d
 d          Z G d d          Z G d d          Z efde	e!         de!fdZ"dS )    N)Template)AnyCallableDictList
NamedTupleOptionalTuple)Encoding	Tokenizerzvisualizer-styles.cssc                   B    e Zd ZU eed<   eed<   eed<   dededefdZdS )
Annotationstartendlabelc                 0    || _         || _        || _        d S N)r   r   r   )selfr   r   r   s       W/var/www/html/ai-engine/env/lib/python3.11/site-packages/tokenizers/tools/visualizer.py__init__zAnnotation.__init__   s    



    N)__name__
__module____qualname__int__annotations__strr    r   r   r   r      sZ         JJJ	HHHJJJc  C      r   r   c                   <    e Zd ZU ee         ed<   ee         ed<   dS )CharStateKeytoken_ixanno_ixN)r   r   r   r	   r   r   r   r   r   r    r       s2         smc]r   r    c                   d    e Zd ZU ee         ed<   d Zed             Zed             Z	de
fdZdS )	CharStatechar_ixc                 0    || _         d | _        g | _        d S r   )r%   r"   tokens)r   r%   s     r   r   zCharState.__init__'   s    &*!#r   c                 P    t          | j                  dk    r| j        d         nd S )Nr   lenr'   r   s    r   r!   zCharState.token_ix-   s&    !$T[!1!1A!5!5t{1~~4?r   c                 2    t          | j                  dk    S )zJ
        BPE tokenizers can output more than one token for a char
           r)   r+   s    r   is_multitokenzCharState.is_multitoken1   s    
 4;!##r   returnc                 8    t          | j        | j                  S )N)r!   r"   )r    r!   r"   r+   s    r   partition_keyzCharState.partition_key8   s#    ]L
 
 
 	
r   N)r   r   r   r	   r   r   r   propertyr!   r.   r    r1   r   r   r   r$   r$   $   s         c]$ $ $ @ @ X@ $ $ X$
| 
 
 
 
 
 
r   r$   c                       e Zd ZdS )AlignedN)r   r   r   r   r   r   r4   r4   ?   s        Dr   r4   c            
          e Zd ZdZ ej        dej                  Z	 	 ddede	de
eegef                  fd	Zg dfd
edede
e	         de
e         fdZededeeef         fd            Zedee         d
edefd            Zed
edededefd            Zed
ededefd            Zed
edededee         fd            ZdS )EncodingVisualizera  
    Build an EncodingVisualizer

    Args:

         tokenizer (:class:`~tokenizers.Tokenizer`):
            A tokenizer instance

         default_to_notebook (:obj:`bool`):
            Whether to render html output in a notebook by default

         annotation_converter (:obj:`Callable`, `optional`):
            An optional (lambda) function that takes an annotation in any format and returns
            an Annotation object
    z(.{1})?(unk|oov)(.{1})?)flagsTN	tokenizerdefault_to_notebookannotation_converterc                     |r'	 ddl m}m} n# t          $ r t	          d          w xY w|| _        || _        || _        d S )Nr   HTMLdisplayzWe couldn't import IPython utils for html display.
                        Are you running in a notebook?
                        You can also pass `default_to_notebook=False` to get back raw HTML
                    )IPython.core.displayr=   r>   ImportError	Exceptionr8   r9   annotation_coverter)r   r8   r9   r:   r=   r>   s         r   r   zEncodingVisualizer.__init__V   s|      		>>>>>>>>>      ##6 #7 s    'textannotationsr/   c                 \   | j         }||}|r'	 ddlm}m} n# t          $ r t          d          w xY w| j        "t          t          | j        |                    }| j	        
                    |          }t                              |||          }|r | ||                     dS |S )a  
        Build a visualization of the given text

        Args:
            text (:obj:`str`):
                The text to tokenize

            annotations (:obj:`List[Annotation]`, `optional`):
                An optional list of annotations of the text. The can either be an annotation class
                or anything else if you instantiated the visualizer with a converter function

            default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
                If True, will render the html in a notebook. Otherwise returns an html string.

        Returns:
            The HTML string if default_to_notebook is False, otherwise (default) returns None and
            renders the HTML in the notebook

        Nr   r<   zeWe couldn't import IPython utils for html display.
                    Are you running in a notebook?)r9   r?   r=   r>   r@   rA   rB   listmapr8   encoder6   _EncodingVisualizer__make_html)	r   rC   rD   r9   final_default_to_notebookr=   r>   encodinghtmls	            r   __call__zEncodingVisualizer.__call__l   s    2 %)$<!*(;%$ 	>>>>>>>>>   6  
 #/s4#;[IIJJK>((..!--dHkJJ$ 	GDDJJKs    2c                    t          |           dk    ri S t          t          d |                     }t          |          }t          d|z            }|dk     rd}d}d}d}i }t	          |          D ]}d| d	| d
| d||<   ||z  }|S )a  
        Generates a color palette for all the labels in a given set of annotations

        Args:
          annotations (:obj:`Annotation`):
            A list of annotations

        Returns:
            :obj:`dict`: A dictionary mapping labels to colors in HSL format
        r   c                     | j         S r   )r   )xs    r   <lambda>z;EncodingVisualizer.calculate_label_colors.<locals>.<lambda>   s    17 r             @   
   zhsl(,z%,%)r*   setrG   r   sorted)	rD   labels
num_labelsh_stepslhcolorsr   s	            r   calculate_label_colorsz)EncodingVisualizer.calculate_label_colors   s     {q  IS**K8899[[
S:%&&B;;FF^^ 	 	E0100q00A000F5MKAAr   consecutive_chars_listrK   c                    | d         }|j         |j        |j                 }d| dS | d         }|j         }|j         dz   }|||         }g }	i }
|j        |	                    d           |j        r|	                    d           |j        d	z  r|	                    d
           n|	                    d           t
          j                            |j        |j                           *|	                    d           |j        |j                 |
d<   n|	                    d           dd                    |	           d}d}|
	                                D ]\  }}|d| d| dz  }d| d| d| dS )a  
        Converts a list of "consecutive chars" into a single HTML element.
        Chars are consecutive if they fall under the same word, token and annotation.
        The CharState class is a named tuple with a "partition_key" method that makes it easy to
        compare if two chars are consecutive.

        Args:
            consecutive_chars_list (:obj:`List[CharState]`):
                A list of CharStates that have been grouped together

            text (:obj:`str`):
                The original text being processed

            encoding (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`str`: The HTML span for a set of consecutive chars
        r   Nz(<span class="special-token" data-stoken=z></span>r-   tokenzmulti-token   z	odd-tokenz
even-tokenzspecial-tokenstokz	non-tokenzclass=" " z data-z="z<span z ></span>)
r%   r'   r!   appendr.   r6   unk_token_regexsearchjoinitems)rc   rC   rK   firststokenlastr   r   	span_textcss_classes
data_itemscssdatakeyvals                  r   consecutive_chars_to_htmlz,EncodingVisualizer.consecutive_chars_to_html   s   2 'q)= _U^4F OfNNNN%b)lQsO	
>%w'''" 2""=111~! 1
 "";//// ""<000!1889XYYe""?333%-_U^%D
6" {+++4#((;//444"((** 	+ 	+HC*S**C****DD888d88i8888r   c                    t                               | ||          }|d         g}|d         j        }g }t                               |          }|d         j        }|3||         }	|	j        }
||
         }|                    d| d|
 d           |dd          D ]
}|j        }||k    r|                    t                               || |                     |g}||                    d           |3||         }	|	j        }
||
         }|                    d| d|
 d           |}|                                |d                                         k    r|                    |           |                    t                               || |                     |g}|                    t                               || |                     t          |          }|S )Nr   z&<span class="annotation" style="color:z" data-label="z">r-   )rC   rK   rl   )	r6   %_EncodingVisualizer__make_char_statesr"   rb   r   rm   r|   r1   HTMLBody)rC   rK   rD   char_statescurrent_consecutive_charsprev_anno_ixspanslabel_colors_dictcur_anno_ixannor   colorcsress                 r   __make_htmlzEncodingVisualizer.__make_html   sA   (;;D(KXX%0^$4!"1~-.EEkRR!!n,"{+DJE%e,ELL`%``W\```aaaabb/ &	1 &	1B*Kl**&@@1!!) A     .0D)+LL+++*&{3D JE-e4ELL!h%!h!h_d!h!h!hiii&L!!%>q%A%O%O%Q%QQQ)004444 &@@1!!) A     .0D)) 	88)! 9  	
 	
 	
 uoo
r   c                     dgt          |           z  }t          |          D ]'\  }}t          |j        |j                  D ]}|||<   (|S )a  
        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`AnnotationList`):
                A (possibly empty) list of annotations

        Returns:
            A list of  length len(text) whose entry at index i is None if there is no annotation on
            charachter i or k, the index of the annotation that covers index i where k is with
            respect to the list of annotations
        N)r*   	enumerateranger   r   )rC   rD   annotation_mapr"   ais         r   __make_anno_mapz"EncodingVisualizer.__make_anno_map<  sf     #d))+#K00 	, 	,JGQ17AE** , ,$+q!!,r   c                    t                               | |          }d t          t          |                     D             }t	          |j                  D ]T\  }}|                    |          }|8|\  }}	t          ||	          D ]"}
||
         j                            |           #Ut	          |          D ]\  }}|||         _        |S )a  
        For each character in the original text, we emit a tuple representing it's "state":

            * which token_ix it corresponds to
            * which word_ix it corresponds to
            * which annotation_ix it corresponds to

        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`List[Annotation]`):
                A (possibly empty) list of annotations

            encoding: (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
            it's state is
        c                 ,    g | ]}t          |          S r   )r$   ).0r%   s     r   
<listcomp>z9EncodingVisualizer.__make_char_states.<locals>.<listcomp>j  s     '['['[w	'(:(:'['['[r   )	r6   "_EncodingVisualizer__make_anno_mapr   r*   r   r'   token_to_charsrm   r"   )rC   rK   rD   r   r   r!   rf   offsetsr   r   r   r%   r"   s                r   __make_char_statesz%EncodingVisualizer.__make_char_statesQ  s    . ,;;D+NN'['[%PSTXPYPYJZJZ'['['[(99 	; 	;OHe--h77G"$
suc** ; ;AN)00:::: ). 9 9 	3 	3GW+2K ((r   )TN)r   r   r   __doc__recompile
IGNORECASErn   r   boolr	   r   r   r   r   r   AnnotationListrM   staticmethodr   rb   r   r$   r   r|   rI   PartialIntListr   r~   r   r   r   r6   r6   C   s	          !bj!>bmTTTO
 %)FJ	  " 'xz0A'BC	   2 ').2	+ ++ $+ &d^	+
 
#+ + + +Z N tCH~    \8 A9 $YA9A9 A9 A9 A9 \A9F ?# ? ? ?SV ? ? ? \?B c  >    \( " " "~ "Z^_hZi " " " \" " "r   r6   childrenr/   c                 >    d                     |           }d| d| dS )a[  
    Generates the full html with css from a list of html spans

    Args:
        children (:obj:`List[str]`):
            A list of strings, assumed to be html elements

        css_styles (:obj:`str`, `optional`):
            Optional alternative implementation of the css

    Returns:
        :obj:`str`: An HTML string with style markup
    rk   z?
    <html>
        <head>
            <style>
                zs
            </style>
        </head>
        <body>
            <div class="tokenized-text" dir=auto>
            z4
            </div>
        </body>
    </html>
    )rp   )r   
css_styleschildren_texts      r   r   r   w  sB     GGH%%M 	     r   )#	itertoolsosr   stringr   typingr   r   r   r   r   r	   r
   
tokenizersr   r   pathdirname__file__rp   css_filenameopenfreadrx   r   r   r   r   r    r$   r4   r6   r   r   r   r   r   <module>r      sD       				 				       I I I I I I I I I I I I I I I I I I * * * * * * * * '//(
#
#w||G%<==	T, 1
&&((C                      j!hsm$    :   

 
 
 
 
 
 
 
6	 	 	 	 	 	 	 	q q q q q q q qh	 .1  tCy S      s   -BBB