§
    íNg™  ã                  ó|   — d Z ddlmZ ddlmZmZ ddlmZmZ ddl	m
Z
 ddddddœdd„Zdd„Z G d„ de¦  «        ZdS )a“  Implementation of baseline chunking.

This is the "plain-vanilla" chunking strategy. All the fundamental chunking behaviors are present in
this strategy and also in all other strategies. Those are:

- Maximally fill each chunk with sequential elements.
- Isolate oversized elements and divide (only) those chunks by text-splitting.
- Overlap when requested.

"Fancier" strategies add higher-level semantic-unit boundaries to be respected. For example, in the
by-title strategy, section boundaries are respected, meaning a chunk never contains text from two
different sections. When a new section is detected the current chunk is closed and a new one
started.
é    )Úannotations)ÚIterableÚOptional)ÚChunkingOptionsÚ
PreChunker)ÚElementN©Úinclude_orig_elementsÚmax_charactersÚnew_after_n_charsÚoverlapÚoverlap_allÚelementsúIterable[Element]r
   úOptional[bool]r   úOptional[int]r   r   r   Úreturnúlist[Element]c               ó`   — t                                |||||¬¦  «        }t          | |¦  «        S )av  Combine sequential `elements` into chunks, respecting specified text-length limits.

    Produces a sequence of `CompositeElement`, `Table`, and `TableChunk` elements (chunks).

    Parameters
    ----------
    elements
        A list of unstructured elements. Usually the output of a partition function.
    include_orig_elements
        When `True` (default), add elements from pre-chunk to the `.metadata.orig_elements` field
        of the chunk(s) formed from that pre-chunk. Among other things, this allows access to
        original-element metadata that cannot be consolidated and is dropped in the course of
        chunking.
    max_characters
        Hard maximum chunk length. No chunk will exceed this length. A single element that exceeds
        this length will be divided into two or more chunks using text-splitting.
    new_after_n_chars
        A chunk that of this length or greater is not extended to include the next element, even if
        that element would fit without exceeding `max_characters`. A "soft max" length that can be
        used in conjunction with `max_characters` to limit most chunks to a preferred length while
        still allowing larger elements to be included in a single chunk without resorting to
        text-splitting. Defaults to `max_characters` when not specified, which effectively disables
        any soft window. Specifying 0 for this argument causes each element to appear in a chunk by
        itself (although an element with text longer than `max_characters` will be still be split
        into two or more chunks).
    overlap
        Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the
        next chunk as a context-preserving mechanism. By default, this only applies to split-chunks
        where an oversized element is divided into multiple chunks by text-splitting.
    overlap_all
        Default: `False`. When `True`, apply overlap between "normal" chunks formed from whole
        elements and not subject to text-splitting. Use this with caution as it produces a certain
        level of "pollution" of otherwise clean semantic chunk boundaries.
    r	   )Ú_BasicChunkingOptionsÚnewÚ_chunk_elements)r   r
   r   r   r   r   Úoptss          úW/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/chunking/basic.pyÚchunk_elementsr      s@   € õX !×$Ò$Ø3Ø%Ø+ØØð %ñ ô €Dõ ˜8 TÑ*Ô*Ð*ó    r   r   c                ó@   — d„ t          j        | |¦  «        D ¦   «         S )z(Implementation of actual basic chunking.c                ó@   — g | ]}|                      ¦   «         D ]}|‘ŒŒS © )Úiter_chunks)Ú.0Ú	pre_chunkÚchunks      r   ú
<listcomp>z#_chunk_elements.<locals>.<listcomp>S   sK   € ð ð ð àØ×*Ò*Ñ,Ô,ðð ð ð 	ðð ð ð r   )r   Úiter_pre_chunks)r   r   s     r   r   r   O   s0   € ðð å#Ô3°H¸dÑCÔCðñ ô ð r   c                  ó   — e Zd ZdZdS )r   zOptions for `basic` chunking.N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r   r   Z   s   € € € € € Ø'Ð'Ð'Ð'r   )r   r   r
   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   )r*   Ú
__future__r   Útypingr   r   Úunstructured.chunking.baser   r   Úunstructured.documents.elementsr   r   r   r   r   r   r   ú<module>r/      sÝ   ððð ð #Ð "Ð "Ð "Ð "Ð "à %Ð %Ð %Ð %Ð %Ð %Ð %Ð %à BÐ BÐ BÐ BÐ BÐ BÐ BÐ BØ 3Ð 3Ð 3Ð 3Ð 3Ð 3ð -1Ø$(Ø'+Ø!Ø"&ð4+ð 4+ð 4+ð 4+ð 4+ð 4+ðnð ð ð ð(ð (ð (ð (ð (˜Oñ (ô (ð (ð (ð (r   