Rem
Rem $Header: dbgendev/src/langdata/plsql/llm/llm_pkg.pkb /main/6 2025/08/13 01:29:21 jiangnhu Exp $
Rem
Rem llm_pkg.pkb
Rem
Rem Copyright (c) 2025, Oracle and/or its affiliates.
Rem
Rem    NAME
Rem      llm_pkg.pkb - Langdata LLM Package
Rem
Rem    DESCRIPTION
Rem      This package handles all interactions with LLMs via DBMS_CLOUD_AI.
Rem
Rem    NOTES
Rem      None
Rem
Rem    BEGIN SQL_FILE_METADATA
Rem    SQL_SOURCE_FILE: dbgendev/src/langdata/plsql/llm/llm_pkg.pkb
Rem    SQL_SHIPPED_FILE:
Rem    SQL_PHASE:
Rem    SQL_STARTUP_MODE: NORMAL
Rem    SQL_IGNORABLE_ERRORS: NONE
Rem    END SQL_FILE_METADATA
Rem
Rem    MODIFIED   (MM/DD/YY)
Rem    jiangnhu    07/23/25 - DBAI-764: Implement
Rem                           rerank_filter_value_candidates
Rem    pryarla     07/21/25 - DBAI-1075: Generate sample queries using LLM
Rem    pryarla     05/15/25 - DBAI-737: Created LLM Package
Rem

CREATE OR REPLACE PACKAGE BODY lang_data_llm_pkg IS

    -- validate_llm_profile
    --   Probes a DBMS_CLOUD_AI profile by sending the fixed prompt 'Say Hello'
    --   with the 'chat' action.
    --
    -- Parameters:
    --   p_profile_name - name of the DBMS_CLOUD_AI profile to probe.
    --
    -- Returns:
    --   TRUE when the probe call and the follow-up logging complete. This
    --   function never returns FALSE; failures are reported by raising.
    --
    -- Raises:
    --   Any failure inside the body is first logged with its SQLERRM and is
    --   then reported through lang_data_errors_pkg.raise_error with
    --   c_invalid_dbms_cloud_ai_profile.
    FUNCTION validate_llm_profile(
        p_profile_name IN VARCHAR2
    ) RETURN BOOLEAN IS
        v_llm_result CLOB;
    BEGIN
        v_llm_result := DBMS_CLOUD_AI.GENERATE(
            prompt       => 'Say Hello',
            profile_name => p_profile_name,
            action       => 'chat'
        );

        -- Log only the first 1000 characters of the response.
        -- DBMS_LOB.SUBSTR already returns fewer characters when the LOB is
        -- shorter, so no explicit length clamp is needed.
        lang_data_logger_pkg.log_debug(
            'LLM Response for "Say Hello": ' ||
            DBMS_LOB.SUBSTR(v_llm_result, 1000, 1)
        );
        lang_data_logger_pkg.log_info('Successfully called LLM using "' || p_profile_name || '" profile.');

        RETURN TRUE;

    EXCEPTION
        WHEN OTHERS THEN
            -- Record the underlying cause before it is replaced by the
            -- generic invalid-profile error; otherwise it is lost.
            lang_data_logger_pkg.log_error(
                'LLM profile validation failed for "' || p_profile_name ||
                '": ' || SQLERRM
            );
            lang_data_errors_pkg.raise_error(
                lang_data_errors_pkg.c_invalid_dbms_cloud_ai_profile
            );
            -- Fallback in case raise_error returns without raising
            -- (presumably it always raises; verify against the errors package).
            RAISE;
    END validate_llm_profile;

    -- query_llm
    --   Sends a prompt to the LLM through DBMS_CLOUD_AI.GENERATE using the
    --   'chat' action. The profile name is read from the configuration
    --   parameter LANG_DATA_DBMS_CLOUD_AI_PROFILE_NAME.
    --
    -- Parameters:
    --   p_prompt - full prompt text to send.
    --
    -- Returns:
    --   The CLOB returned by DBMS_CLOUD_AI.GENERATE, unmodified.
    --
    -- Errors raised by the configuration lookup or by DBMS_CLOUD_AI are not
    -- handled here and propagate to the caller.
    FUNCTION query_llm(
        p_prompt IN CLOB
    ) RETURN CLOB IS
        l_profile VARCHAR2(125) :=
            lang_data_config_pkg.get_config_parameter(
                'LANG_DATA_DBMS_CLOUD_AI_PROFILE_NAME'
            );
    BEGIN
        RETURN DBMS_CLOUD_AI.GENERATE(
            prompt       => p_prompt,
            profile_name => l_profile,
            action       => 'chat'
        );
    END query_llm;

    -- detect_named_entities
    --   Asks the LLM (via query_llm) to extract named entities from a text
    --   and converts the answer into a JSON array of entity objects.
    --
    -- Parameters:
    --   p_query  - text to analyse; embedded verbatim in the prompt.
    --   p_labels - labels the LLM is told to use; embedded verbatim.
    --
    -- Returns:
    --   A JSON array; each element has the keys
    --     "start" - the "offset" reported by the LLM,
    --     "end"   - offset + length - 1 (inclusive end position),
    --     "text"  - entity text reported by the LLM,
    --     "label" - entity label reported by the LLM.
    --
    -- Raises:
    --   Any error (LLM call failure, unparsable response, missing
    --   "entities" array, ...) is logged and re-raised unchanged.
    FUNCTION detect_named_entities(
        p_query  IN VARCHAR2,
        p_labels IN VARCHAR2
    ) RETURN JSON
    IS
        -- CLOB rather than VARCHAR2(4000): the prompt embeds p_query and
        -- p_labels, so a long query would otherwise raise ORA-06502.
        v_prompt          CLOB;
        v_llm_response    CLOB;
        v_clean_response  CLOB;
        v_doc_result      JSON_OBJECT_T;
        v_entities_arr    JSON_ARRAY_T := JSON_ARRAY_T();
        v_entity          JSON_OBJECT_T;
        v_entity_item     JSON_OBJECT_T;
        v_entity_array    JSON_ARRAY_T;
        v_offset          NUMBER;
    BEGIN
        lang_data_logger_pkg.log_info('Using LLM to detect the following entities: "' || p_labels || '" in query: ' || p_query);

        -- Step 1: Build strict JSON-based prompt.
        -- Starting from TO_CLOB makes the whole concatenation a CLOB
        -- expression, so it is not capped at the VARCHAR2 size limit.
        v_prompt := TO_CLOB('Extract named entities from the given text strictly in the following JSON format:') || CHR(10) ||
                    '{ "entities": [ { "offset": (integer), "length": (integer), "text": (string), "label": (string) }, ... ] }' || CHR(10) ||
                    'The offset should be the start character position (0-indexed) of the entity in the text.' || CHR(10) ||
                    'Text: "' || p_query || '"' || CHR(10) ||
                    'Labels to use: ' || p_labels || '.' || CHR(10) ||
                    'Return ONLY the JSON object. No explanations.';

        -- Step 2: Call the LLM
        v_llm_response := query_llm(v_prompt);

        lang_data_logger_pkg.log_debug('Raw LLM Response Length: ' ||
            DBMS_LOB.GETLENGTH(v_llm_response));

        -- Step 3: Clean the LLM response (strip markdown code fences)
        v_clean_response := REPLACE(v_llm_response, '```json', '');
        v_clean_response := REPLACE(v_clean_response, '```', '');
        v_clean_response := TRIM(v_clean_response);

        -- Step 4: Parse response into JSON
        v_doc_result := JSON_OBJECT_T.parse(v_clean_response);

        -- Step 5: Extract and transform entities
        v_entity_array := TREAT(v_doc_result.get('entities') AS JSON_ARRAY_T);

        FOR j IN 0 .. v_entity_array.get_size - 1 LOOP
            v_entity_item := TREAT(v_entity_array.get(j) AS JSON_OBJECT_T);
            v_offset      := v_entity_item.get_Number('offset');

            v_entity := JSON_OBJECT_T();
            v_entity.put('start', v_offset);
            -- Inclusive end position of the entity.
            v_entity.put('end', v_offset +
                v_entity_item.get_Number('length') - 1);
            v_entity.put('text', v_entity_item.get_String('text'));
            v_entity.put('label', v_entity_item.get_String('label'));

            v_entities_arr.append(v_entity);
        END LOOP;

        -- Step 6: Return final JSON
        RETURN JSON(v_entities_arr.to_string);

    EXCEPTION
        WHEN OTHERS THEN
            -- Single-line literal: the previous literal spanned a line
            -- break and put a newline plus indentation into the message.
            lang_data_logger_pkg.log_error(
                'Unexpected error in detect_named_entities: ' || SQLERRM
            );
            RAISE;
    END detect_named_entities;

    -- rerank_filter_value_candidates
    --   Asks the LLM (via query_llm) to score every candidate string
    --   against the query and reports the highest-scoring candidate.
    --
    -- Parameters:
    --   p_query             - query string the candidates are scored against.
    --   p_filter_candidates - JSON array of candidate strings.
    --   p_top_candidate     - OUT: candidate_str with the highest score.
    --   p_top_score         - OUT: that score. The search starts from -1,
    --                         so -1 is returned when no score exceeds it.
    --   p_top_index         - OUT: 1-based position of the top candidate in
    --                         p_filter_candidates (first match); 0 when the
    --                         top candidate string is not found there.
    --
    -- Raises:
    --   lang_data_errors_pkg.c_invalid_llm_response (through raise_error)
    --   when the LLM returns a different number of entries than there are
    --   candidates, or when an input candidate is missing from the answer.
    --   Every error is logged and re-raised.
    PROCEDURE rerank_filter_value_candidates (
        p_query              IN VARCHAR2,
        p_filter_candidates  IN JSON_ARRAY_T,
        p_top_candidate      OUT VARCHAR2,
        p_top_score          OUT NUMBER,
        p_top_index          OUT INTEGER
    ) IS
        -- CLOB rather than VARCHAR2(4000): the prompt embeds the query and
        -- the whole serialized candidate list, which easily exceeds 4000
        -- bytes and would raise ORA-06502.
        v_prompt          CLOB;
        v_candidates_text VARCHAR2(32767);
        v_response_raw    CLOB;
        v_response_clean  CLOB;
        v_response_json   JSON_OBJECT_T;
        v_scores_array    JSON_ARRAY_T;
        v_obj             JSON_OBJECT_T;
        v_score           NUMBER;
        v_candidate       VARCHAR2(4000);
        v_max_score       NUMBER := -1;
        v_best_index      PLS_INTEGER := -1;
        v_best_candidate  VARCHAR2(4000);
        -- Set of candidate strings present in the LLM answer.
        TYPE candidate_set IS TABLE OF BOOLEAN INDEX BY VARCHAR2(4000);
        v_seen            candidate_set;
    BEGIN
        -- Serialize once; used for both the log line and the prompt.
        v_candidates_text := p_filter_candidates.to_string;

        lang_data_logger_pkg.log_info(
            'Using LLM to rerank the following filter value candidates: "' ||
            v_candidates_text || '" for query: ' || p_query
        );

        -- Step 1: Build strict JSON-based prompt.
        -- Starting from TO_CLOB keeps the whole concatenation a CLOB.
        v_prompt :=
            TO_CLOB('### SYSTEM PROMPT') || CHR(10) ||
            'Your task is to semantically SCORE the given QUERY_STRING against a list of CANDIDATE_STRINGS.' || CHR(10) ||
            'Return a single JSON object with a key "scores", whose value is an array of objects.' || CHR(10) ||
            'Each object must include:' || CHR(10) ||
            '  - "score": float between 0 and 1' || CHR(10) ||
            '  - "candidate_str": the candidate string used' || CHR(10) ||
            'IMPORTANT: You MUST include exactly one object for each input candidate.' || CHR(10) ||
            'You MAY return them in any order.' || CHR(10) ||
            '---' || CHR(10) ||
            'QUERY_STRING: "' || p_query || '"' || CHR(10) ||
            'CANDIDATE_STRINGS: ' || v_candidates_text || CHR(10) ||
            '---' || CHR(10) ||
            '### OUTPUT_FORMAT' || CHR(10) ||
            'Return a single valid JSON object, like:' || CHR(10) ||
            '{ "scores": [ { "score": 0.91, "candidate_str": "Balance Inquiry" }, ... ] }' || CHR(10) ||
            'Do NOT include any explanatory text, markdown syntax, headings, or formatting.';

        -- Step 2: Call the LLM
        v_response_raw := query_llm(v_prompt);

        lang_data_logger_pkg.log_debug(
            'Raw LLM Response Length: ' || DBMS_LOB.GETLENGTH(v_response_raw)
        );

        -- Step 3: Clean the LLM response (strip markdown code fences)
        v_response_clean := REPLACE(v_response_raw, '```json', '');
        v_response_clean := REPLACE(v_response_clean, '```', '');
        v_response_clean := TRIM(v_response_clean);

        -- Step 4: Parse response into JSON and record which candidate
        -- strings the LLM returned
        v_response_json := JSON_OBJECT_T.parse(v_response_clean);
        v_scores_array := TREAT(v_response_json.get('scores') AS JSON_ARRAY_T);

        FOR i IN 0 .. v_scores_array.get_size - 1 LOOP
            v_candidate :=
                TREAT(
                    v_scores_array.get(i) AS JSON_OBJECT_T
                ).get_string('candidate_str');
            v_seen(v_candidate) := TRUE;
        END LOOP;

        -- Step 5: Assert LLM returned same number of candidates
        IF v_scores_array.get_size != p_filter_candidates.get_size THEN
            lang_data_logger_pkg.log_error(
                'LLM output length does not match candidate set.'
            );
            lang_data_errors_pkg.raise_error(
                lang_data_errors_pkg.c_invalid_llm_response
            );
        END IF;

        -- Step 6: Assert all original candidates are covered
        FOR i IN 0 .. p_filter_candidates.get_size - 1 LOOP
            v_candidate := p_filter_candidates.get_string(i);
            IF NOT v_seen.EXISTS(v_candidate) THEN
                lang_data_logger_pkg.log_error(
                    'Missing candidate_str in LLM output: ' ||
                    v_candidate
                );
                lang_data_errors_pkg.raise_error(
                    lang_data_errors_pkg.c_invalid_llm_response
                );
            END IF;
        END LOOP;

        -- Step 7: Find the candidate with the highest score.
        -- Strict ">" keeps the first entry on ties.
        FOR i IN 0 .. v_scores_array.get_size - 1 LOOP
            v_obj := TREAT(v_scores_array.get(i) AS JSON_OBJECT_T);
            v_score := v_obj.get_number('score');
            v_candidate := v_obj.get_string('candidate_str');
            IF v_score > v_max_score THEN
                v_max_score := v_score;
                v_best_candidate := v_candidate;
            END IF;
        END LOOP;

        -- Step 8: Locate the winner in the caller's original array
        FOR i IN 0 .. p_filter_candidates.get_size - 1 LOOP
            IF p_filter_candidates.get_string(i) = v_best_candidate THEN
                v_best_index := i;
                EXIT;
            END IF;
        END LOOP;

        p_top_candidate := v_best_candidate;
        p_top_score := v_max_score;
        -- Convert the 0-based JSON array position to a 1-based index.
        p_top_index := v_best_index + 1;

    EXCEPTION
        WHEN OTHERS THEN
            lang_data_logger_pkg.log_error(
                'Error in rerank_filter_value_candidates: ' || SQLERRM
            );
            RAISE;
    END rerank_filter_value_candidates;

    -- get_prompt_additional_info
    --   Builds the "ALREADY EXISTING QUERIES" prompt section from a cursor
    --   of existing sample queries.
    --
    -- Parameters:
    --   p_sample_queries - open cursor; each row is fetched into seven
    --                      values, of which only the second (the query
    --                      text) is used. The cursor is fetched to
    --                      exhaustion and is NOT closed here.
    --                      NOTE(review): confirm that the caller closes it.
    --
    -- Returns:
    --   NULL when the cursor yields no rows. Otherwise a fragment that is
    --   spliced in while the caller's INPUT REPORT code fence is still
    --   open and is followed by the caller's closing fence (see
    --   generate_sample_queries). The fragment therefore:
    --     1. closes the caller's open fence,
    --     2. emits the section header and instruction,
    --     3. opens a new fence and lists the queries as "1. ...", "2. ...".
    --   The caller's closing fence then terminates the query list.
    --   Previously the fragment ended with a closing fence of its own,
    --   which left the header inside the report fence and produced two
    --   consecutive fence lines in the final prompt.
    FUNCTION get_prompt_additional_info (
        p_sample_queries IN SYS_REFCURSOR
    ) RETURN CLOB
    IS
        v_id                  VARCHAR2(36);
        v_query               VARCHAR2(2000);
        v_version             NUMBER;
        v_report_id           VARCHAR2(36);
        v_drilldown_id        VARCHAR2(36);
        v_enhanced_text       VARCHAR2(4000);
        v_status              VARCHAR2(20);
        v_line_number         NUMBER := 1;
        v_sample_queries_clob CLOB;
    BEGIN
        LOOP
            FETCH p_sample_queries INTO v_id, v_query, v_version, v_report_id, v_drilldown_id, v_enhanced_text, v_status;
            EXIT WHEN p_sample_queries%NOTFOUND;

            -- One numbered line per existing query.
            v_sample_queries_clob := v_sample_queries_clob || TO_CHAR(v_line_number) || '. ' || v_query || CHR(10);
            v_line_number := v_line_number + 1;
        END LOOP;

        -- The counter never advanced, so no rows were fetched.
        IF v_line_number = 1 THEN
            RETURN NULL;
        END IF;

        RETURN
            TO_CLOB('```') || CHR(10) ||
            '---' || CHR(10) ||
            '#### ALREADY EXISTING QUERIES' || CHR(10) ||
            'The following user queries have already been generated previously. **Do not repeat or paraphrase** these in your output.' || CHR(10) ||
            '```' || CHR(10) ||
            v_sample_queries_clob;
    END get_prompt_additional_info;

    -- generate_sample_queries
    --   Asks the LLM (via query_llm) to invent user queries that a report
    --   could answer, given the report's title, description and filter
    --   metadata.
    --
    -- Parameters:
    --   p_title              - report title, embedded in the prompt.
    --   p_description        - report description, embedded in the prompt.
    --   p_filter_information - filter metadata text, embedded in the prompt.
    --   p_sample_queries     - optional cursor of already existing queries;
    --                          when not NULL it is passed to
    --                          get_prompt_additional_info and the result is
    --                          appended to the prompt.
    --   p_count              - number of queries to request (default 10);
    --                          an explicit NULL is treated as 10.
    --
    -- Returns:
    --   The LLM response with markdown code-fence markers removed and
    --   spaces trimmed. The response is not parsed or validated here.
    FUNCTION generate_sample_queries (
        p_title              IN VARCHAR2,
        p_description        IN VARCHAR2,
        p_filter_information IN VARCHAR2,
        p_sample_queries     IN SYS_REFCURSOR DEFAULT NULL,
        p_count              IN NUMBER DEFAULT 10
    ) RETURN CLOB IS
        v_prompt     CLOB;
        v_response   CLOB;
        -- An explicit NULL bypasses the parameter default and would leave
        -- the requested count blank in the prompt.
        v_count      NUMBER := COALESCE(p_count, 10);
    BEGIN
        -- Step 1: Build the prompt.
        -- Starting from TO_CLOB keeps the concatenation a CLOB expression,
        -- so long descriptions or filter metadata cannot overflow the
        -- VARCHAR2 size limit. Code fences are followed by CHR(10): PL/SQL
        -- does not interpret '\n' inside a string literal.
        v_prompt :=
            TO_CLOB('### SYSTEM PROMPT') || CHR(10) ||
            'Generate **exactly ' || TO_CHAR(v_count) || ' distinct user queries** that could be answered by the supplied REPORT.' || CHR(10) ||
            '---' || CHR(10) ||
            '#### REPORT COMPONENTS' || CHR(10) ||
            '1. **TITLE**' || CHR(10) ||
            '2. **DESCRIPTION** – explains what the report covers and the kinds of questions it can answer.' || CHR(10) ||
            '3. **FILTER_INFORMATION** – metadata about filters available in the report.' || CHR(10) ||
            '---' || CHR(10) ||
            '#### FILTER_INFORMATION DETAILS' || CHR(10) ||
            '- Each filter includes a **filter_name** and **default_value**.' || CHR(10) ||
            '- **Enumerable filters** specify a set of possible values. User queries may reference these values using synonyms or variations, not necessarily exact matches.' || CHR(10) ||
            '- **NER filters** specify an **entity_type** whose value will be extracted from the query by an NER model.' || CHR(10) ||
            '---' || CHR(10) ||
            '#### EXAMPLE' || CHR(10) ||
            '```' || CHR(10) ||
            'TITLE: Product estate planning' || CHR(10) ||
            'DESCRIPTION: This report provides a comprehensive view of a customer''s estate-planning activities with Fidelity, including wills, trusts, beneficiary designations, and estate-tax strategies.' || CHR(10) ||
            'FILTER_INFORMATION: [' || CHR(10) ||
            '  {' || CHR(10) ||
            '    "filter_name": "Beneficiary",' || CHR(10) ||
            '    "use_ner": false,' || CHR(10) ||
            '    "description": "Name of beneficiaries",' || CHR(10) ||
            '    "enumerable_set": [' || CHR(10) ||
            '      "Bob","Frank","Kim","Don","Anne","Jill","Goodluck","Patience","Jonathan"' || CHR(10) ||
            '    ],' || CHR(10) ||
            '    "default_value": "Kim"' || CHR(10) ||
            '  }' || CHR(10) ||
            ']' || CHR(10) ||
            '```' || CHR(10) ||
            '---' || CHR(10) ||
            '#### INPUT REPORT' || CHR(10) ||
            '```' || CHR(10) ||
            'TITLE: ' || p_title || CHR(10) ||
            'DESCRIPTION: ' || p_description || CHR(10) ||
            'FILTER_INFORMATION: ' || p_filter_information || CHR(10);

        -- Optional section listing queries the LLM must not repeat.
        IF p_sample_queries IS NOT NULL THEN
            v_prompt := v_prompt || get_prompt_additional_info(p_sample_queries);
        END IF;

        -- The requested count here must match the one stated at the top of
        -- the prompt (it was previously hard-coded to 50).
        v_prompt := v_prompt ||
            '```' || CHR(10) ||
            '---' || CHR(10) ||
            '#### OUTPUT FORMAT' || CHR(10) ||
            'Return a single JSON array containing **exactly ' || TO_CHAR(v_count) || '** user-query strings and **nothing else** (no explanations, headings, or extra text).';

        -- Step 2: Call the LLM
        v_response := query_llm(p_prompt => v_prompt);

        lang_data_logger_pkg.log_debug('Raw LLM Response Length: ' ||
            DBMS_LOB.GETLENGTH(v_response));

        -- Step 3: Clean the LLM response (strip markdown code fences)
        v_response := REPLACE(v_response, '```json', '');
        v_response := REPLACE(v_response, '```', '');
        v_response := TRIM(v_response);

        RETURN v_response;
    END generate_sample_queries;

END lang_data_llm_pkg;
/
