
    Ng!                        d Z ddlmZ ddlmZmZmZ ddlmZm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZ ddddddddddZddZ G d de
          ZdS )zdImplementation of chunking by title.

Main entry point is the `@add_chunking_strategy()` decorator.
    )annotations)IterableIteratorOptional)CHUNK_MULTI_PAGE_DEFAULTBoundaryPredicateChunkingOptionsPreChunkCombiner
PreChunkeris_on_next_pageis_title)Element)lazypropertyNcombine_text_under_n_charsinclude_orig_elementsmax_charactersmultipage_sectionsnew_after_n_charsoverlapoverlap_allelementsIterable[Element]r   Optional[int]r   Optional[bool]r   r   r   r   r   returnlist[Element]c          	     d    t                               |||||||          }t          | |          S )a	  Uses title elements to identify sections within the document for chunking.

    Splits off into a new CompositeElement when a title is detected or if metadata changes, which
    happens when page numbers or sections change. Cuts off sections once they have exceeded a
    character length of max_characters.

    Parameters
    ----------
    elements
        A list of unstructured elements. Usually the output of a partition function.
    combine_text_under_n_chars
        Combines elements (for example a series of titles) until a section reaches a length of
        n characters. Defaults to `max_characters` which combines chunks whenever space allows.
        Specifying 0 for this argument suppresses combining of small chunks. Note this value is
        "capped" at the `new_after_n_chars` value since a value higher than that would not change
        this parameter's effect.
    include_orig_elements
        When `True` (default), add elements from pre-chunk to the `.metadata.orig_elements` field
        of the chunk(s) formed from that pre-chunk. Among other things, this allows access to
        original-element metadata that cannot be consolidated and is dropped in the course of
        chunking.
    max_characters
        Chunks elements text and text_as_html (if present) into chunks of length
        n characters (hard max)
    multipage_sections
        If True, sections can span multiple pages. Defaults to True.
    new_after_n_chars
        Cuts off new sections once they reach a length of n characters (soft max). Defaults to
        `max_characters` when not specified, which effectively disables any soft window.
        Specifying 0 for this argument causes each element to appear in a chunk by itself (although
        an element with text longer than `max_characters` will be still be split into two or more
        chunks).
    overlap
        Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the
        next chunk as a context-preserving mechanism. By default, this only applies to split-chunks
        where an oversized element is divided into multiple chunks by text-splitting.
    overlap_all
        Default: `False`. When `True`, apply overlap between "normal" chunks formed from whole
        elements and not subject to text-splitting. Use this with caution as it entails a certain
        level of "pollution" of otherwise clean semantic chunk boundaries.
    r   )_ByTitleChunkingOptionsnew_chunk_by_title)	r   r   r   r   r   r   r   r   optss	            W/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/chunking/title.pychunk_by_titler$      sF    h #&&#=3%-+ '  D 8T***    r"   r   c                    t          t          j        | |          |                                          }d |D             S )z-Implementation of actual "by-title" chunking.)r"   c                @    g | ]}|                                 D ]}|S  )iter_chunks).0	pre_chunkchunks      r#   
<listcomp>z#_chunk_by_title.<locals>.<listcomp>_   s1    SSSi9;P;P;R;RSS%ESSSSr%   )r
   r   iter_pre_chunksiter_combined_pre_chunks)r   r"   
pre_chunkss      r#   r!   r!   W   sP     ""8T22      TS:SSSSr%   c                  l     e Zd ZdZedd            Zedd            Zedd            Zd fd
Z xZ	S )r   a9  Adds the by-title-specific chunking options to the base case.

    `by_title`-specific options:

    combine_text_under_n_chars
        A remedy to over-chunking caused by elements mis-identified as Title elements.
        Every Title element would start a new chunk and this setting mitigates that, at the
        expense of sometimes violating legitimate semantic boundaries.
    multipage_sections
        Indicates that page-boundaries should not be respected while chunking, i.e. elements
        appearing on two different pages can appear in the same chunk.
    r   tuple[BoundaryPredicate, ...]c                >     d fd}t           |                      S )a(  The semantic-boundary detectors to be applied to break pre-chunks.

        For the `by_title` strategy these are sections indicated by a title (section-heading), an
        explicit section metadata item (only present for certain document types), and optionally
        page boundaries.
        r   Iterator[BoundaryPredicate]c               3  P   K   t           V   j        st                      V  d S d S )N)r   r   r   )selfs   r#   iter_boundary_predicateszM_ByTitleChunkingOptions.boundary_predicates.<locals>.iter_boundary_predicatesy   s=      NNN* (%'''''''( (r%   )r   r4   )tuple)r6   r7   s   ` r#   boundary_predicatesz+_ByTitleChunkingOptions.boundary_predicatesp   s;    	( 	( 	( 	( 	( 	(
 --//000r%   intc                L    | j                             d          }|| j        n|S )a  Combine consecutive text pre-chunks if former is smaller than this and both will fit.

        - Does not combine table chunks with text chunks even if they would both fit in the
          chunking window.
        - Does not combine text chunks if together they would exceed the chunking window.
        - Defaults to `max_characters` when not specified.
        - Is reduced to `new_after_n_chars` when it exceeds that value.
        r   )_kwargsgethard_maxr6   	arg_values     r#   r   z2_ByTitleChunkingOptions.combine_text_under_n_chars   s+     L$$%ABB	 ) 1t}}y@r%   boolc                f    | j                             d          }|t          nt          |          S )z0When False, break pre-chunks on page-boundaries.r   )r<   r=   r   rA   r?   s     r#   r   z*_ByTitleChunkingOptions.multipage_sections   s1     L$$%9::	+4+<''$y//Qr%   Nonec                    t                                                       | j        dk     rt          d| j                   | j        | j        k    rt          d| j         d| j                   dS )z2Raise ValueError if request option-set is invalid.r   z8'combine_text_under_n_chars' argument must be >= 0, got zR'combine_text_under_n_chars' argument must not exceed `max_characters` value, got z > N)super	_validater   
ValueErrorr>   )r6   	__class__s    r#   rF   z!_ByTitleChunkingOptions._validate   s     	 *Q..:7: :   *T]::S#>S SCG=S S   ;:r%   )r   r2   )r   r:   )r   rA   )r   rC   )
__name__
__module____qualname____doc__r   r9   r   r   rF   __classcell__)rH   s   @r#   r   r   b   s          1 1 1 \1 A A A \A R R R \R
         r%   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r"   r   r   r   )rL   
__future__r   typingr   r   r   unstructured.chunking.baser   r   r	   r
   r   r   r   unstructured.documents.elementsr   unstructured.utilsr   r$   r!   r   r(   r%   r#   <module>rS      sM   
 # " " " " " / / / / / / / / / /                  4 3 3 3 3 3 + + + + + + 15,0$()-'+!"&=+ =+ =+ =+ =+ =+@T T T TG G G G Go G G G G Gr%   