Rem
Rem $Header: dbgendev/src/langdata/plsql/named-entities/named_entities_pkg.pkb /main/22 2025/08/13 01:29:21 jiangnhu Exp $
Rem
Rem named_entities_pkg.pkb
Rem
Rem Copyright (c) 2024, 2025, Oracle and/or its affiliates.
Rem
Rem    NAME
Rem      named_entities_pkg.pkb - Named Entities Package
Rem
Rem    DESCRIPTION
Rem      Package body for the named entities package. Contains the routines
Rem      that manage named entities (e.g. get_all_named_entities and
Rem      add_named_entity) and the routines that run NER on query text.
Rem
Rem    NOTES
Rem      None
Rem
Rem    BEGIN SQL_FILE_METADATA
Rem    SQL_SOURCE_FILE: dbgendev/src/langdata/plsql/named-entities/named_entities_pkg.pkb
Rem    SQL_SHIPPED_FILE:
Rem    SQL_PHASE:
Rem    SQL_STARTUP_MODE: NORMAL
Rem    SQL_IGNORABLE_ERRORS: NONE
Rem    END SQL_FILE_METADATA
Rem
Rem    MODIFIED   (MM/DD/YY)
Rem    deveverm    07/28/25 - modified v_model_cache to varchar2(4000)
Rem    jiangnhu    07/25/25 - DBAI-1128: Add null check for 'filters' in
Rem                           add_named_entities_to_document
Rem    dadoshi     07/07/25 - JIRA_DBAI1022: Remove
Rem                           c_unique_constraint_violated exception usage
Rem    arevathi    07/01/25 - Point cache dir to DBFS
Rem    fgurrola    06/27/25 - Moving initialization call to setup_pkg.
Rem    fgurrola    06/10/25 - Muting debugging info
Rem    deveverm    06/05/25 - change model_cache to dbs, fix bug on ner_call
Rem    deveverm    06/02/25 - DBAI-794: shifted from PLSQL-SDK to DBMS_CLOUD
Rem    pryarla     05/15/25 - DBAI-737: Use LLM for NER based on config
Rem    dadoshi     05/15/25 - Remove get_entity_using_ner() procedure
Rem    dadoshi     05/15/25 - JIRA_DBAI804: Store context embeddings as CLOB
Rem    dadoshi     05/13/25 - JIRA_DBAI804: Add support for identifying
Rem                           multiple entities of the same type
Rem    deveverm    05/06/25 - DBAI-752: added cache_dir parameter to 
Rem                           process_entities call
Rem    arevathi    04/18/25 - Removing app-expert role check on
Rem                           get_all_named_entities as it is accessed by
Rem                           search_from_query
Rem    deveverm    04/10/25 - DBAI-723: Refactored all functions calling NER to
Rem                           named_entities_pkg
Rem    jiangnhu    04/04/25 - Move get_entities_from_text from utils pkg to 
Rem                           named entities pkg to avoid mutual dependency
Rem    dadoshi     03/11/25 - JIRA_DBAI574: Add add_named_entities_to_document
Rem                           API usage
Rem    dadoshi     03/11/25 - Update add_named_entity to take id as an optional
Rem                           argument.
Rem    arevathi    10/21/24 - Modified Headers
Rem    pryarla     10/16/24 - Created
Rem

CREATE OR REPLACE PACKAGE BODY lang_data_named_entities_pkg IS

    -- Seeds langdata$named_entities with the built-in entity types.
    -- Each row receives its own id from lang_data_utils_pkg.generate_id();
    -- all rows are written by a single INSERT statement.
    PROCEDURE initialize_named_entities IS
    BEGIN
        INSERT INTO langdata$named_entities (id, name)
        SELECT lang_data_utils_pkg.generate_id(), builtin.column_value
        FROM TABLE(
            SYS.ODCIVARCHAR2LIST(
                'currency',
                'date',
                'email',
                'event',
                'facility',
                'gpe',
                'duration',
                'ipaddress',
                'number',
                'organization',
                'percentage',
                'person',
                'phonenumber',
                'product',
                'time',
                'url'
            )
        ) builtin;
    END initialize_named_entities;

    -- Returns every row of langdata$named_entities through a ref cursor.
    --
    -- Parameters:
    --   p_named_entities  OUT cursor over the (id, name) pairs of all
    --                     named entities. The caller is responsible for
    --                     fetching from and closing it.
    --
    -- Any error raised while opening the cursor is logged at fatal level
    -- and re-raised unchanged.
    PROCEDURE get_all_named_entities (
        p_named_entities OUT SYS_REFCURSOR
    )
    IS
    BEGIN
        OPEN p_named_entities FOR
            SELECT ne.id, ne.name
            FROM langdata$named_entities ne;
    EXCEPTION
        WHEN OTHERS THEN
            lang_data_logger_pkg.log_fatal(
                'An unknown error occurred while retrieving entities. Error: '
                || SQLERRM
            );
            RAISE;
    END get_all_named_entities;


    -- Registers every entity type referenced by the NER filters of a match
    -- document that is not yet present in langdata$named_entities.
    --
    -- Parameters:
    --   p_match_document  JSON document. Its optional 'filters' array is
    --                     scanned; for each item whose 'use_ner' flag is
    --                     true, the item's 'entity_type' is added through
    --                     add_named_entity when no row with that name exists.
    --
    -- Raises:
    --   c_unsupported_entity_type when
    --   lang_data_config_pkg.g_custom_entities is FALSE. Errors raised by
    --   add_named_entity propagate. Known error codes are re-raised as-is;
    --   anything else is logged at fatal level and then re-raised.
    PROCEDURE add_named_entities_to_document (
        p_match_document    IN JSON
    )
    IS
        v_json              JSON_OBJECT_T;
        v_filters           JSON_ARRAY_T;
        v_filter_item       JSON_OBJECT_T;
        v_entity_type       VARCHAR2(255);
        v_cnt               NUMBER;
    BEGIN
        IF lang_data_config_pkg.g_custom_entities = FALSE THEN
            lang_data_logger_pkg.log_error(
                'Adding named entity is not supported.'
            );
            lang_data_errors_pkg.raise_error(
                lang_data_errors_pkg.c_unsupported_entity_type
            );
        END IF;

        -- Wrap the JSON value in a JSON_OBJECT_T so it can be navigated
        v_json := JSON_OBJECT_T.parse(json_serialize(p_match_document));

        -- Get the 'filters' array; a document without filters is a no-op
        v_filters := v_json.GET_ARRAY('filters');

        IF v_filters IS NULL THEN
            lang_data_logger_pkg.log_debug('No filters in match document');
            RETURN;
        END IF;

        FOR i IN 0 .. v_filters.get_size - 1
        LOOP
            v_filter_item := JSON_OBJECT_T(v_filters.GET(i));
            -- A missing 'use_ner' yields NULL, which skips the filter
            IF v_filter_item.GET_BOOLEAN('use_ner') THEN
                v_entity_type := v_filter_item.get_string('entity_type');

                SELECT count(*)
                INTO v_cnt
                FROM langdata$named_entities
                WHERE name = v_entity_type;

                IF v_cnt = 0 THEN
                    add_named_entity(
                        p_name => v_entity_type
                    );
                END IF;
            END IF;
        END LOOP;
    EXCEPTION
        WHEN OTHERS THEN
            -- Errors raised deliberately (here or in add_named_entity) are
            -- passed through without being reported as "unknown".
            IF SQLCODE IN (
                    lang_data_errors_pkg.c_unauthorized_code,
                    lang_data_errors_pkg.c_invalid_parameters_code,
                    lang_data_errors_pkg.c_max_text_length_exceeded,
                    lang_data_errors_pkg.c_unsupported_entity_type,
                    lang_data_errors_pkg.c_resource_already_exists
                ) THEN
                RAISE;
            END IF;
            lang_data_logger_pkg.log_fatal(
                'An unknown error occurred while adding named entities.' ||
                ' Error: ' || SQLERRM
            );
            RAISE;
    END add_named_entities_to_document;


    -- Inserts a new named entity into langdata$named_entities.
    --
    -- Parameters:
    --   p_name  Name of the entity; required, at most 255 characters.
    --   p_id    Optional id for the new row, at most 36 characters. When
    --           NULL an id is produced by lang_data_utils_pkg.generate_id().
    --
    -- Raises (through lang_data_errors_pkg.raise_error):
    --   c_unsupported_entity_type   custom entities are disabled
    --                               (g_custom_entities = FALSE).
    --   c_unauthorized_code         the app-expert role is not enabled.
    --   c_invalid_parameters_code   p_name is NULL.
    --   c_max_text_length_exceeded  p_id or p_name is too long.
    --   c_resource_already_exists   the insert hit a unique constraint.
    -- Any other error is logged at fatal level and re-raised.
    PROCEDURE add_named_entity (
        p_name  IN VARCHAR2,
        p_id    IN VARCHAR2 DEFAULT NULL
    )
    AS
        v_id    VARCHAR2(36);
    BEGIN
        IF lang_data_config_pkg.g_custom_entities = FALSE THEN
            lang_data_logger_pkg.log_error(
                'Adding named entity is not supported.'
            );
            lang_data_errors_pkg.raise_error(
                lang_data_errors_pkg.c_unsupported_entity_type
            );
        END IF;

        -- Check if User is Authorized to perform the action
        IF NOT lang_data_auth_pkg.is_role_enabled(
                lang_data_auth_pkg.c_lang_data_app_expert
            ) THEN
            lang_data_errors_pkg.raise_error(
                lang_data_errors_pkg.c_unauthorized_code
            );
        END IF;

        IF p_name IS NULL THEN
            lang_data_errors_pkg.raise_error(
                lang_data_errors_pkg.c_invalid_parameters_code
            );
        END IF;

        -- Validate the caller-supplied id before copying it into v_id so an
        -- over-long value raises the package error rather than ORA-06502.
        IF p_id IS NULL THEN
            v_id := lang_data_utils_pkg.generate_id();
        ELSE
            IF (LENGTH(p_id) > 36) THEN
                lang_data_errors_pkg.raise_error(
                    lang_data_errors_pkg.c_max_text_length_exceeded
                );
            ELSE
                v_id := p_id;
            END IF;
        END IF;

        IF (LENGTH(p_name) > 255) THEN
            lang_data_errors_pkg.raise_error(
                lang_data_errors_pkg.c_max_text_length_exceeded
            );
        END IF;

        -- Insert the new entity into the named entities table
        INSERT INTO langdata$named_entities (id, name)
        VALUES (v_id, p_name);

    EXCEPTION
        WHEN DUP_VAL_ON_INDEX THEN
            -- Handle unique constraint violation on name
            lang_data_logger_pkg.log_error(
                'Entity with name ' || p_name || ' already exists'
            );
            lang_data_errors_pkg.raise_error(
                lang_data_errors_pkg.c_resource_already_exists
            );

        WHEN OTHERS THEN
            -- Errors raised deliberately above are passed through as-is.
            IF SQLCODE IN ( lang_data_errors_pkg.c_unauthorized_code,
                        lang_data_errors_pkg.c_invalid_parameters_code,
                        lang_data_errors_pkg.c_max_text_length_exceeded,
                        lang_data_errors_pkg.c_unsupported_entity_type) THEN
                RAISE;
            END IF;

            -- Anything else is unexpected: log it, then re-raise so the
            -- caller is not told the insert succeeded when it did not.
            lang_data_logger_pkg.log_fatal(
                'An unknown error occurred while adding '
                || p_name || ' entity. Error: ' || SQLERRM
            );
            RAISE;
    END add_named_entity;

    -- Runs named-entity recognition (NER) on p_query using one of three
    -- back ends, selected from configuration:
    --   1. LANG_DATA_USE_LLM = 'true' (case-insensitive): delegate to
    --      lang_data_llm_pkg.detect_named_entities.
    --   2. LANG_DATA_IS_AUTONOMOUS_DB = 'true': POST the text to the OCI
    --      Language batchDetectLanguageEntities endpoint via DBMS_CLOUD and
    --      map each returned type/subType pair to a lower-case label.
    --   3. Otherwise: call the registered OML4PY function
    --      'process_entities' through pyqEval.
    --
    -- Parameters:
    --   p_query   Text to analyse.
    --   p_labels  Entity labels to look for (callers in this package pass a
    --             comma-separated list). Passed as-is to the LLM back end,
    --             lower-cased for pyqEval, and not used by the OCI back end.
    --
    -- Returns:
    --   For the OCI and pyqEval back ends, a JSON array of objects with the
    --   keys 'start', 'end', 'text' and 'label'. For the LLM back end, the
    --   value returned by detect_named_entities.
    --
    -- Raises (through lang_data_errors_pkg.raise_error), OCI back end only:
    --   c_OCI_compartment_not_set, c_OCI_region_not_set,
    --   c_OCI_credential_not_set when the matching config value is NULL.
    FUNCTION ner_call(
        p_query VARCHAR2,
        p_labels VARCHAR2
    ) RETURN JSON
    IS
        v_is_adb    VARCHAR2(10);
        v_entities_json JSON;
        -- Assigned only on the pyqEval path. Initialising it in the
        -- declaration would raise VALUE_ERROR for long label lists before
        -- BEGIN, even on paths that never read it.
        v_labels_lower VARCHAR2(32767);
        v_pyq_json_object JSON_OBJECT_T := JSON_OBJECT_T();
        v_pyq_string VARCHAR2(4000);

        v_entities_array_response JSON_ARRAY_T;
        v_entity_response JSON_OBJECT_T;
        v_entity_json_obj JSON_OBJECT_T;
        v_entities_arr JSON_ARRAY_T := JSON_ARRAY_T('[]');
        v_type          VARCHAR2(32767);
        v_sub_type      VARCHAR2(32767);

        v_cred_name     VARCHAR2(4000);
        v_base_url      varchar2(4000);
        v_full_resp     DBMS_CLOUD_TYPES.resp;
        v_req_body      json_object_t;
        v_req_headers   json_object_t;
        v_doc_json_obj  json_object_t;
        v_doc_json_array json_array_t;
        -- SHA-256 digest of the request body (32 bytes)
        v_hash          RAW(32);
        v_body_hash     VARCHAR2(4000);
        v_second_level_domain VARCHAR2(4000);
        v_compartment_id    VARCHAR2(4000);
        v_region_code       VARCHAR2(4000);
        v_response_json_obj JSON_OBJECT_T;
        v_model_cache       VARCHAR2(4000);
    BEGIN
        v_is_adb := lang_data_config_pkg.get_config_parameter(
            'LANG_DATA_IS_AUTONOMOUS_DB'
        );
        v_compartment_id := lang_data_config_pkg.get_config_parameter(
            'LANG_DATA_OCI_COMPARTMENT'
        );
        v_region_code := lang_data_config_pkg.get_config_parameter(
            'LANG_DATA_OCI_REGION'
        );
        v_cred_name := lang_data_config_pkg.get_config_parameter(
            'LANG_DATA_OCI_CRED'
        );

        -- If config use LLM for NER
        IF LOWER(
                lang_data_config_pkg.get_config_parameter(
                    'LANG_DATA_USE_LLM'
                )
            ) = 'true' THEN
            lang_data_logger_pkg.log_debug('Using LLM for NER');
            v_entities_json := lang_data_llm_pkg.detect_named_entities(
                p_query,
                p_labels
            );
        -- If ADB use OCI NER
        ELSIF v_is_adb = 'true' THEN
            -- The OCI request needs compartment, region and credential
            IF v_compartment_id IS NULL THEN
                lang_data_logger_pkg.log_error(
                    '''LANG_DATA_OCI_COMPARTMENT'' is not set.'
                );
                lang_data_errors_pkg.raise_error(
                    lang_data_errors_pkg.c_OCI_compartment_not_set
                );
            END IF;
            IF v_region_code IS NULL THEN
                lang_data_logger_pkg.log_error(
                    '''LANG_DATA_OCI_REGION'' is not set.'
                );
                lang_data_errors_pkg.raise_error(
                    lang_data_errors_pkg.c_OCI_region_not_set
                );
            END IF;
            IF v_cred_name IS NULL THEN
                lang_data_logger_pkg.log_error(
                    '''LANG_DATA_OCI_CRED'' is not set.'
                );
                lang_data_errors_pkg.raise_error(
                    lang_data_errors_pkg.c_OCI_credential_not_set
                );
            END IF;

            -- Create doc json object
            v_doc_json_obj := JSON_OBJECT_T('{}');
            v_doc_json_obj.put('key', 'doc1');
            -- NOTE(review): single quotes in the query are doubled before
            -- being sent. The value is placed in a JSON object rather than
            -- in SQL text, so this may shift the offsets and texts that
            -- come back -- confirm whether the doubling is still needed.
            v_doc_json_obj.put('text', REPLACE(p_query, '''',''''''));

            -- Create doc array
            v_doc_json_array := JSON_ARRAY_T('[]');
            v_doc_json_array.append(v_doc_json_obj);

            -- Create details json
            v_req_body := JSON_OBJECT_T('{}');
            IF v_compartment_id IS NOT NULL THEN
                v_req_body.put('compartmentId', v_compartment_id);
            END IF;

            v_req_body.put('documents', v_doc_json_array);

            -- x-content-sha256 carries the base64 SHA-256 of the body
            v_hash := DBMS_CRYPTO.hash(
                v_req_body.to_blob, DBMS_CRYPTO.HASH_SH256);
            v_body_hash := UTL_RAW.cast_to_varchar2(
                UTL_ENCODE.base64_encode(v_hash));

            v_req_headers := JSON_OBJECT_T('{}');
            v_req_headers.PUT('accept', 'application/json');
            v_req_headers.PUT('content-type', 'application/json');
            v_req_headers.PUT('opc-request-id',UPPER(sys_guid()));
            -- Emulating PLSQL SDK for now
            v_req_headers.PUT('opc-client-info','Oracle-PlsqlSDK/1.9');
            v_req_headers.PUT('x-content-sha256',v_body_hash);

            v_base_url :=
                'https://language.aiservice.{region}.oci.{secondLevelDomain}/20221001/actions/batchDetectLanguageEntities';

            -- Using Endpoint reference:
            -- https://docs.oracle.com/en-us/iaas/api/#/en/language/20221001/
            IF v_region_code = 'eu-jovanovac-1' THEN
                v_second_level_domain := 'oraclecloud20.com';
            ELSE
                v_second_level_domain := 'oraclecloud.com';
            END IF;

            -- Plain REPLACE: the placeholders are literal text, not
            -- regular expressions.
            v_base_url := REPLACE(
                v_base_url,
                '{region}',
                v_region_code);

            v_base_url := REPLACE(
                v_base_url,
                '{secondLevelDomain}',
                v_second_level_domain);

            lang_data_logger_pkg.log_debug(
                'NER REST request header: '|| v_req_headers.to_clob()
            );
            lang_data_logger_pkg.log_debug(
                'NER REST request body: '|| v_req_body.to_clob()
            );

            -- Send the request
            v_full_resp := DBMS_CLOUD.send_request(
                credential_name => v_cred_name,
                uri             => v_base_url,
                method          => 'POST',
                headers         => v_req_headers.to_clob(),
                body            => v_req_body.to_blob
            );

            -- Only one document was sent, so only documents[0] is read
            v_response_json_obj :=
                JSON_OBJECT_T(DBMS_CLOUD.GET_RESPONSE_TEXT(v_full_resp));
            v_doc_json_array := v_response_json_obj.get_array('documents');
            v_doc_json_obj := TREAT (
                v_doc_json_array.get(0) AS JSON_OBJECT_T
            );
            v_entities_array_response := v_doc_json_obj.get_array('entities');
            lang_data_logger_pkg.log_debug(
                'Document Key: ' || v_doc_json_obj.get_String('key'));
            lang_data_logger_pkg.log_debug(
                'Detected Language: ' ||
                v_doc_json_obj.get_string('languageCode'));

            -- Loop through entities
            FOR j IN 0 .. v_entities_array_response.get_size - 1 LOOP
                v_entity_response := TREAT(
                    v_entities_array_response.get(j) AS JSON_OBJECT_T
                );
                v_type := v_entity_response.get_string('type');
                v_sub_type := v_entity_response.get_string('subType');

                lang_data_logger_pkg.log_debug(
                    'Found Entity: '|| v_entity_response.get_string('text')
                    ||CHR(9)||
                    ' | type: '|| v_type
                    ||CHR(9)||
                    ' | subtype: '|| v_sub_type
                );

                -- 'end' is the inclusive index of the last character
                v_entity_json_obj := JSON_OBJECT_T();
                v_entity_json_obj.put(
                    'start', v_entity_response.get_number('offset'));
                v_entity_json_obj.put(
                    'end',
                    v_entity_response.get_number('offset')
                        + v_entity_response.get_number('length') - 1);
                v_entity_json_obj.put(
                    'text', v_entity_response.get_string('text'));

                -- Handle entity types which have sub-types
                IF v_type = 'DATETIME' THEN
                    -- Fallback sub-type label for DATETIME = DATE
                    IF v_sub_type = 'TIME' THEN
                        v_entity_json_obj.put('label', 'time');
                    ELSIF v_sub_type = 'DURATION' THEN
                        v_entity_json_obj.put('label', 'duration');
                    ELSIF v_sub_type = 'INTERVAL' THEN
                        -- INTERVAL entities are dropped from the result
                        CONTINUE;
                    ELSE
                        v_entity_json_obj.put('label', 'date');
                    END IF;

                ELSIF v_type = 'LOCATION' THEN

                    -- Fallback sub-type label for LOCATION = GPE
                    IF v_sub_type = 'FACILITY' THEN
                        v_entity_json_obj.put('label', 'facility');
                    ELSE
                        v_entity_json_obj.put('label', 'gpe');
                    END IF;

                ELSIF v_type = 'QUANTITY' THEN

                    -- Fallback sub-type label for QUANTITY = number
                    IF v_sub_type = 'CURRENCY' THEN
                        v_entity_json_obj.put('label', 'currency');
                    ELSIF v_sub_type = 'PERCENTAGE' THEN
                        v_entity_json_obj.put('label', 'percentage');
                    ELSE
                        v_entity_json_obj.put('label', 'number');
                    END IF;

                ELSE
                    -- Types without sub-type handling map to their own name
                    v_entity_json_obj.put('label', LOWER(v_type));
                END IF;

                v_entities_arr.APPEND(v_entity_json_obj);
            END LOOP;

            v_entities_json := JSON(v_entities_arr.TO_STRING);
        -- By default use NuNER
        ELSE
            v_labels_lower := LOWER(p_labels);
            -- NOTE(review): quotes are doubled here as well; confirm that
            -- process_entities expects this form.
            v_pyq_json_object.put('query', REPLACE(p_query, '''', ''''''));
            v_pyq_json_object.put('labels', v_labels_lower);
            v_model_cache := lang_data_config_pkg.get_config_parameter(
                                'MOUNT_DIR'
                            );
            v_model_cache := v_model_cache || '/models';
            v_pyq_json_object.put(
                'cache_dir',
                v_model_cache
            );
            v_pyq_string := v_pyq_json_object.to_string();
            -- Call the registered OML4PY function to process entities
            SELECT JSON_ARRAYAGG(
                    JSON_OBJECT(
                        'start' VALUE "start",
                        'end' VALUE "end",
                        'text' VALUE "text",
                        'label' VALUE "label"
                    ) RETURNING JSON
                )
            INTO v_entities_json
            FROM TABLE(
                pyqEval(
                    v_pyq_string,
                    '{
                        "start": "number",
                        "end": "number",
                        "text": "varchar2(4000)",
                        "label": "varchar2(100)"
                    }',
                    'process_entities'
                )
            );
        END IF;
        RETURN v_entities_json;
    END ner_call;

    -- Detects entities of every registered type in a piece of text.
    --
    -- Collects the names of all rows returned by get_all_named_entities
    -- into a comma-separated label list and passes it, together with
    -- p_text, to ner_call.
    --
    -- Parameters:
    --   p_text  Text to analyse.
    --
    -- Returns:
    --   The JSON value returned by ner_call, or NULL when no named entities
    --   are registered (ner_call is not invoked in that case).
    FUNCTION get_entities_from_text (
        p_text IN VARCHAR2
    ) RETURN JSON
    IS
        v_all_named_entities     SYS_REFCURSOR;
        -- Anchored to the table columns so the fetch buffers always match
        -- the stored widths (names may be far longer than ids).
        v_named_entity_id        langdata$named_entities.id%TYPE;
        v_named_entity_name      langdata$named_entities.name%TYPE;
        v_all_labels_str         VARCHAR2(32767);
        v_entities               JSON;
    BEGIN
        -- Step 1: Gather all labels
        lang_data_named_entities_pkg.get_all_named_entities(
            v_all_named_entities
        );
        LOOP
            FETCH v_all_named_entities
            INTO v_named_entity_id, v_named_entity_name;
            EXIT WHEN v_all_named_entities%NOTFOUND;
            IF v_all_labels_str IS NOT NULL THEN
                v_all_labels_str := v_all_labels_str || ',' ||
                                    v_named_entity_name;
            ELSE
                v_all_labels_str := v_named_entity_name;
            END IF;
        END LOOP;
        CLOSE v_all_named_entities;

        -- Step 2: Run NER only when there is at least one label
        IF TRIM(v_all_labels_str) IS NOT NULL THEN
            v_entities := lang_data_named_entities_pkg.ner_call(
                p_query => p_text,
                p_labels => v_all_labels_str
            );
        END IF;

        RETURN v_entities;
    EXCEPTION
        WHEN OTHERS THEN
            -- Do not leak the cursor if fetching or concatenation fails
            IF v_all_named_entities%ISOPEN THEN
                CLOSE v_all_named_entities;
            END IF;
            RAISE;
    END get_entities_from_text;

    
    -- NOTE: For every entity_type, there must be a filter description and an
    -- associated ID of the document to which the filter belongs, i.e., 
    --     length(p_entity_types) = length(p_filter_descriptions) and
    --     length(p_entity_types) = length(p_ids) = Number of Total NER Filters
    PROCEDURE get_entities_using_ner(
        p_query                             IN VARCHAR2,
        p_filter_descriptions               IN SYS.ODCIVARCHAR2LIST,
        p_entity_types                      IN SYS.ODCIVARCHAR2LIST,
        p_ids                               IN SYS.ODCIVARCHAR2LIST,
        p_results                           OUT SYS.ODCIVARCHAR2LIST
    ) IS
        v_entity_type                       VARCHAR2(255);
        v_labels_str                        VARCHAR2(32767);
        v_entities                          JSON;
        v_process_entities_results          JSON;
        v_existing_entities                 JSON;
        v_results                           SYS.ODCIVARCHAR2LIST := SYS.ODCIVARCHAR2LIST();
        v_entities_json                     JSON_OBJECT_T;
        v_entity_context                    VARCHAR2(32767);
        v_context_json                      JSON_OBJECT_T;
        v_entities_arr                      JSON_ARRAY_T;
        v_entities_with_context_arr         JSON_ARRAY_T;
        v_context_embeddings_clob           CLOB;
        v_context_embedding                 VECTOR;
        v_filter_description                VARCHAR(4000);
        v_filter_embedding                  VECTOR;
        v_prev_context_len                  NUMBER;
        v_post_context_len                  NUMBER;
        v_cur_vector                        VECTOR;
        v_cur_distance                      NUMBER;
        v_distances                         JSON;
        v_json_object                       JSON_OBJECT_T;
        v_min_distance                      NUMBER;
        v_best_index                        NUMBER;
        v_best_match                        VARCHAR2(4000);
        v_pyq_json_object                   JSON_OBJECT_T := JSON_OBJECT_T();
        v_pyq_string                        VARCHAR2(4000);
        v_filter_desc_str                   VARCHAR2(4000);
        v_entity_types_str                  VARCHAR2(4000);
        v_entity_text                       VARCHAR2(4000);
        v_entity_label                      VARCHAR2(100);
        -- Unique Key to the "cache" will be the concatenation of 
        -- Report/Drilldown ID(36) + Entity-Type/Label(255) +
        -- Index of the entity value in the entities JSON of a 
        -- langdata$nerstate row, in the string form(assuming to not be >9).
        v_unique_key                        VARCHAR2(300);
        TYPE entity_set IS TABLE OF BOOLEAN INDEX BY VARCHAR2(255);
        TYPE document_visited IS TABLE OF BOOLEAN INDEX BY VARCHAR2(36);
        TYPE cache_used IS TABLE OF BOOLEAN INDEX BY VARCHAR2(300);
        v_already_present entity_set;
        v_doc_visited document_visited;
        v_cache_used cache_used;
        v_flag BOOLEAN := false;
        v_json_clob CLOB;
    BEGIN
        v_prev_context_len := TO_NUMBER(
            lang_data_config_pkg.get_config_parameter(
                'LANG_DATA_PREV_CONTEXT_LEN'
            )
        );
        v_post_context_len := TO_NUMBER(
            lang_data_config_pkg.get_config_parameter(
                'LANG_DATA_POST_CONTEXT_LEN'
            )
        );

        -- De-duplicate the entity types before feeding it to the NER Call
        -- as this helps with latency costs. 
        -- Iterate through all the entity_types provided and:
        --      1. Build the labels string, to be passed in process_entities
        --      2. Set result[i] to NULL if entity_type is NULL
        FOR i in 1 .. p_entity_types.COUNT LOOP
            v_entity_type := p_entity_types(i);
            v_results.EXTEND;
            
            IF v_entity_type IS NULL THEN
                v_results(i) := NULL;
                CONTINUE;
            END IF;

            -- Check if entity type is already seen
            IF v_already_present.EXISTS(v_entity_type) THEN
                CONTINUE; -- Skip duplicates
            END IF;

            v_already_present(v_entity_type) := TRUE;

            IF v_labels_str IS NULL THEN
                v_labels_str := v_entity_type;
            ELSE
                v_labels_str := v_labels_str || ',' || v_entity_type;
            END IF;
        END LOOP;

        -- Get all the entities from the query text based on the entity_types,
        -- identified using NER (gliNER).
        v_process_entities_results := lang_data_named_entities_pkg.ner_call(
            p_query => p_query,
            p_labels => v_labels_str
        );

        -- Iterate through the entities identified in the query text and 
        -- populate the langdata$nerstate table:
        --     1. Build the Entity JSON Object (start, end, text, label)
        --     2. Build the Entity Context Embedding and store it in JSON Object
        --     3. a. If the entity type already exists in the table: 
        --              Merge the object built in steps 1 and 2 into the already
        --              existing row data.
        --        b. If the entity type does not exist already:
        --              Insert the JSON objects appropriately in the table.
        FOR rec IN (SELECT * FROM JSON_TABLE(
                                        v_process_entities_results, '$[*]'
                                        COLUMNS (
                                            "start" NUMBER PATH '$.start',
                                            "end" NUMBER PATH '$.end',
                                            "text" VARCHAR2(4000) PATH '$.text',
                                            "label" VARCHAR2(100) PATH '$.label'
                                        ))) LOOP

            -- Step 1: Prepare the Entities JSON Object
            v_entities_json := JSON_OBJECT_T();
            v_entities_json.PUT('start', rec."start");
            v_entities_json.PUT('end', rec."end");
            v_entities_json.PUT('text', rec."text");
            v_entities_json.PUT('label', rec."label");

            -- Step 2: Prepare the Context of the Entity
            SELECT lang_data_utils_pkg.match_substring_with_context(
                p_query,
                rec."text",
                v_prev_context_len,
                v_post_context_len
            )
            INTO v_entity_context
            FROM dual;

            -- Step 2: Fetch embedding for the context
            v_context_embedding := lang_data_utils_pkg.get_embedding(
                v_entity_context
            );
            
            v_context_json := JSON_OBJECT_T();
            v_context_json.PUT('text', v_entity_context);
            v_context_json.PUT(
                'embedding', VECTOR_SERIALIZE(v_context_embedding)
            );

            -- Step 3: Check if a row already exists for this entity_type
            BEGIN
                SELECT entities, context_embeddings
                INTO v_existing_entities, v_context_embeddings_clob
                FROM langdata$nerstate
                WHERE entity_type = rec."label";

                -- Parse existing JSON arrays
                v_entities_arr := JSON_ARRAY_T(
					JSON_SERIALIZE(v_existing_entities)
                );

                -- Check if the same entity is already present in the langdata$nerstate table
                -- from the get_all_entities_using_ner procedure()
                v_flag := false;
                FOR i in 0 .. v_entities_arr.get_size - 1 LOOP
                    v_json_object := TREAT(
                            v_entities_arr.get(
                                i
                            ) AS JSON_OBJECT_T
                    );
                    IF v_json_object.get_string('text') = rec."text" AND 
                        v_json_object.get_string('label') = rec."label" 
                    THEN
                        v_flag := true;
                        EXIT;
                    END IF;
                END LOOP;

                IF v_flag THEN
                    lang_data_logger_pkg.log_debug(
                        'The entity, ' || rec."text" || 
                        ' of type ' || rec."label" || 
                        ' is already present in the cache table.'
                    );
                    continue;
                END IF;

                v_entities_with_context_arr := JSON_ARRAY_T.parse(v_context_embeddings_clob);

                -- Append new entries
                v_entities_arr.append(v_entities_json);
                v_entities_with_context_arr.append(v_context_json);

                v_entities := JSON(v_entities_arr.to_string);
                v_json_clob := lang_data_utils_pkg.json_array_to_clob(
                    v_entities_with_context_arr
                );

                -- Step 3.a: Update the existing row
                UPDATE langdata$nerstate
                SET entities = v_entities,
                    context_embeddings = v_json_clob
                WHERE entity_type = rec."label";

            EXCEPTION
                WHEN NO_DATA_FOUND THEN
                    -- Step 3.b: Row doesn't exist, insert new one
                    v_entities_arr := JSON_ARRAY_T();
                    v_entities_with_context_arr := JSON_ARRAY_T();

                    v_entities_arr.append(v_entities_json);
                    v_entities_with_context_arr.append(v_context_json);

                    v_entities := JSON(v_entities_arr.to_string);
                    v_json_clob := lang_data_utils_pkg.json_array_to_clob(
                        v_entities_with_context_arr
                    );

                    INSERT INTO langdata$nerstate (
                        entity_type, entities, context_embeddings
                    )
                    VALUES (
                        rec."label",
                        v_entities,
                        v_json_clob
                    );
            END;
        END LOOP;

        -- Iterate over the provided list of entity types and populate the 
        -- respective entity values:
        --     1. If there is a row for the entity type in the langdata$nerstate
        --        table, then:
        --             a. Fetch the embedding of the associated filter.
        --             b. Compute the distance between the filter embedding(from 
        --                step a) and all the entity context embeddings.
        --             c. Re-initialize the variables specific to finding best 
        --                match to avoid their usage from previous iterations.
        --             d. Iterate through the identified entities fetched from
        --                the langdata$nerstate with the given entity type:
        --                     i. Check if the entity value has already been 
        --                        used, using the cache associative array.
        --                     ii. If yes, skip the current entity; if not,
        --                         compare its corresponding distance (with
        --                         the filter embedding) with the minimum
        --                         distance.
        --                     iii. If the entity provides the minimum
        --                          distance, then update the respective
        --                          variables and store its index (in the
        --                          entities column of the identified row from
        --                          langdata$nerstate).
        --             e. Update the cache (v_cache_used) to store the
        --                information, that this best match entity has been used
        --                for this particular report or drilldown ID at the  
        --                index identified in the iteration above.
        --             f. Store the best match in the Results list.
        --     2. If not, assign the result of this entity type as NULL.
        FOR i in 1 .. p_entity_types.COUNT 
        LOOP
            v_entity_type := p_entity_types(i);
            v_filter_description := p_filter_descriptions(i);

            -- If entity type is NULL, identified value is NULL.
            -- (not possible through search, but just to add a safeguard)
            IF v_entity_type IS NULL THEN
                v_results(i) := NULL;
                continue;
            END IF;

            BEGIN
                -- Check if the langdata$nerstate table has data for entity type
                SELECT entities, context_embeddings
                INTO v_entities, v_context_embeddings_clob
                FROM langdata$nerstate
                WHERE entity_type = v_entity_type;

                -- Step 1.a: Calculate embedding for filter description
                v_filter_embedding := lang_data_utils_pkg.get_embedding(
                    v_filter_description
                );

                -- Step 1.c: Reinitialize the variables to avoid previous values
                v_entities_arr := JSON_ARRAY_T(json_serialize(v_entities));
                v_entities_with_context_arr := JSON_ARRAY_T.parse(v_context_embeddings_clob);
                
                
                v_min_distance := 1e10; 
                v_best_match := NULL;
                v_best_index := -1;

                -- Step 1.d: Iterate through the identified entities
                FOR idx in 0 .. v_entities_arr.get_size - 1 LOOP
                    v_json_object := TREAT(
                            v_entities_with_context_arr.get(
                                idx
                            ) AS JSON_OBJECT_T
                    );
                    v_cur_vector := TO_VECTOR(
                        REPLACE(JSON_SERIALIZE(
                            v_json_object.get('embedding').to_json()
                        ), '"', '')
                    );
                    SELECT VECTOR_DISTANCE(v_filter_embedding, v_cur_vector)
                    INTO v_cur_distance
                    FROM dual;

                    v_json_object := TREAT(
                            v_entities_arr.get(
                                idx
                            ) AS JSON_OBJECT_T
                    );
                    v_entity_text := v_json_object.get_string('text');
                    v_entity_label := v_json_object.get_string('label');

                    -- Unique key for cache for this entity, for this document.
                    v_unique_key := p_ids(i) || v_entity_label || idx;

                    IF ((NOT v_cache_used.EXISTS(v_unique_key)) OR
                     (NOT v_cache_used(v_unique_key))) AND
                     v_cur_distance < v_min_distance THEN
                        v_min_distance := v_cur_distance;
                        v_best_match := v_entity_text;
                        v_best_index := idx;
                    END IF;
                END LOOP;

                -- Unique key for cache for the best match
                v_unique_key := p_ids(i) || v_entity_label || v_best_index;
                v_cache_used(v_unique_key) := TRUE;
                -- Return the best match
                v_results(i) := v_best_match;
            EXCEPTION
                WHEN NO_DATA_FOUND THEN
                    v_results(i) := NULL;
            END;
        END LOOP;

        p_results := v_results;
    END get_entities_using_ner;

    -- get_all_entities_using_ner
    --
    -- Runs NER over p_query for the requested labels, returns the raw NER
    -- output, and records every identified entity (plus an embedding of its
    -- surrounding context) in langdata$nerstate, keyed by entity label.
    --
    -- Parameters:
    --   p_query            - Text to run NER on.
    --   p_labels           - Labels passed through to ner_call.
    --   p_num_words_before - Passed to match_substring_with_context
    --                        (default 5).
    --   p_num_words_after  - Passed to match_substring_with_context
    --                        (default 2).
    --   p_result           - OUT: the JSON returned by ner_call, or an empty
    --                        JSON array when p_query or p_labels is NULL.
    --
    -- Side effects:
    --   For each entity with a non-NULL label, appends the entity to the
    --   "entities" array and its context/embedding object to the
    --   "context_embeddings" array of the langdata$nerstate row for that
    --   label, inserting the row when it does not exist yet.
    --   No COMMIT is issued here.
    PROCEDURE get_all_entities_using_ner (
        p_query              IN VARCHAR2,
        p_labels             IN VARCHAR2,
        p_num_words_before   IN NUMBER DEFAULT 5,
        p_num_words_after    IN NUMBER DEFAULT 2,
        p_result             OUT JSON
    ) AS
        -- Raw NER output; drives the cursor and is returned to the caller.
        v_ner_entities                  JSON;
        -- "entities" column of the current label's state row.
        v_state_entities                JSON;
        v_entities_arr                  JSON_ARRAY_T;
        v_entities_with_context_arr     JSON_ARRAY_T;
        v_context_embeddings_clob       CLOB;
        v_context_obj                   JSON_OBJECT_T;
        v_entity_obj                    JSON_OBJECT_T;
        v_entity_context                VARCHAR2(4000);
        v_context_embedding             VECTOR;
        v_array_as_text                 CLOB;
        v_row_exists                    BOOLEAN;
    BEGIN
        lang_data_logger_pkg.log_info('Starting get_all_entities_using_ner');
        IF (p_query IS NULL) OR (p_labels IS NULL) THEN
            lang_data_logger_pkg.log_info('NULL query or labels');
            p_result := JSON('[]');
            RETURN;
        END IF;

        v_ner_entities := lang_data_named_entities_pkg.ner_call(
            p_query => p_query,
            p_labels => p_labels
        );

        p_result := v_ner_entities;

        -- Iterate through entities and generate context and embeddings
        FOR rec IN (
            SELECT jt."start", jt."end", jt."text", jt."label"
            FROM JSON_TABLE(v_ner_entities, '$[*]'
                    COLUMNS (
                        "start" NUMBER PATH '$.start',
                        "end" NUMBER PATH '$.end',
                        "text" VARCHAR2(4000) PATH '$.text',
                        "label" VARCHAR2(100) PATH '$.label'
                    )) jt
        ) LOOP

            lang_data_logger_pkg.log_info(
                'Processing entity: ' || rec."text" ||
                ' with label: ' || rec."label"
            );

            -- A NULL label can never match "entity_type = ..." below, so it
            -- would always fall through to an INSERT with a NULL entity_type.
            -- Such an entity cannot be looked up later; skip it.
            IF rec."label" IS NULL THEN
                lang_data_logger_pkg.log_info(
                    'Skipping entity without label: ' || rec."text"
                );
                CONTINUE;
            END IF;

            -- Generate context for each entity
            SELECT lang_data_utils_pkg.match_substring_with_context(
                p_query,
                rec."text",
                p_num_words_before,
                p_num_words_after
            )
            INTO v_entity_context
            FROM dual;

            lang_data_logger_pkg.log_info(
                'Generated context: ' || v_entity_context
            );

            v_context_embedding := lang_data_utils_pkg.get_embedding(
                v_entity_context
            );

            lang_data_logger_pkg.log_info('Fetched embedding for context');

            -- Context object: the context string and its serialized embedding
            v_context_obj := JSON_OBJECT_T();
            v_context_obj.PUT('text', v_entity_context);
            v_context_obj.PUT(
                'embedding', VECTOR_SERIALIZE(v_context_embedding)
            );

            -- Entity object: mirrors the NER output fields
            v_entity_obj := JSON_OBJECT_T();
            v_entity_obj.PUT('start', rec."start");
            v_entity_obj.PUT('end', rec."end");
            v_entity_obj.PUT('text', rec."text");
            v_entity_obj.PUT('label', rec."label");

            -- Check if label exists in state. The handler is scoped to the
            -- lookup only, so a NO_DATA_FOUND raised by later parsing or
            -- serialization is not mistaken for "row does not exist".
            BEGIN
                SELECT entities, context_embeddings
                INTO v_state_entities, v_context_embeddings_clob
                FROM langdata$nerstate
                WHERE entity_type = rec."label";

                v_row_exists := TRUE;
            EXCEPTION
                WHEN NO_DATA_FOUND THEN
                    v_row_exists := FALSE;
            END;

            IF v_row_exists THEN
                -- Parse existing entities and entities_with_context
                v_entities_arr := JSON_ARRAY_T(v_state_entities);
                v_entities_with_context_arr := JSON_ARRAY_T.parse(
                    v_context_embeddings_clob
                );
            ELSE
                -- No state for this label yet: start from empty arrays
                v_entities_arr := JSON_ARRAY_T();
                v_entities_with_context_arr := JSON_ARRAY_T();
            END IF;

            -- Append the new entity and entity with context
            v_entities_arr.APPEND(v_entity_obj);
            v_entities_with_context_arr.APPEND(v_context_obj);

            -- Convert back to JSON. to_clob avoids the 32767-byte VARCHAR2
            -- limit of to_string; the array grows with every call.
            v_state_entities := JSON(v_entities_arr.to_clob);
            v_array_as_text := lang_data_utils_pkg.json_array_to_clob(
                v_entities_with_context_arr
            );

            IF v_row_exists THEN
                -- Update the state table
                UPDATE langdata$nerstate
                SET entities = v_state_entities,
                    context_embeddings = v_array_as_text
                WHERE entity_type = rec."label";

                lang_data_logger_pkg.log_info(
                    'Updated langdata$nerstate for label: ' || rec."label"
                );
            ELSE
                -- Insert new label into state
                INSERT INTO langdata$nerstate (
                    entity_type, entities, context_embeddings
                )
                VALUES (
                    rec."label",
                    v_state_entities,
                    v_array_as_text
                );

                lang_data_logger_pkg.log_info(
                    'Inserted into langdata$nerstate for label: ' || rec."label"
                );
            END IF;
        END LOOP;

        lang_data_logger_pkg.log_info('Completed get_all_entities_using_ner');
    END get_all_entities_using_ner;
    
END lang_data_named_entities_pkg;
/


