Rem
Rem $Header: dbgendev/src/langdata/plsql/sql/create_tables.sql /main/41 2025/08/17 19:34:27 deveverm Exp $
Rem
Rem create_tables.sql
Rem
Rem Copyright (c) 2024, 2025, Oracle and/or its affiliates.
Rem
Rem    NAME
Rem      create_tables.sql - Create the tables in the database
Rem
Rem    DESCRIPTION
Rem      This script creates the tables, along with their indices, required
Rem      for Lang-Data in the database.
Rem
Rem    NOTES
Rem      <other useful comments, qualifications, etc.>
Rem
Rem    BEGIN SQL_FILE_METADATA
Rem    SQL_SOURCE_FILE: dbgendev/src/langdata/backend/sql/create_tables.sql
Rem    SQL_SHIPPED_FILE:
Rem    SQL_PHASE:
Rem    SQL_STARTUP_MODE: NORMAL
Rem    SQL_IGNORABLE_ERRORS: NONE
Rem    END SQL_FILE_METADATA
Rem
Rem    MODIFIED   (MM/DD/YY)
Rem    deveverm    08/08/25 - Removed min,max,vector_dims, gamma
Rem                           columns from domain table
Rem    dadoshi     08/04/25 - JIRA_DBAI1147: Add min/max distance, vector
Rem                           dimension, and gamma to langdata
Rem    jiangnhu    07/30/25 - DBAI-1163: Update table langdata$apex_filters to
Rem                           add column enumerable set, description
Rem    deveverm    07/28/25 - DBAI-1011: changed re_ranking_inputs to global
Rem                           temporary table
Rem    ruohli      07/24/25 - DBAI-1076: Added created_at and updated_at
Rem                           within langdata$domains
Rem    deveverm    07/15/25 - DBAI-1050: added langdata$apex_filters
Rem    sathyavc    07/07/25 - DBAI-883: Create api_logs table. Remove 
Rem                           api_metrics table
Rem    jiangnhu    07/07/25 - DBAI-871: Add version in langdata$samplequeries
Rem    sathyavc    06/30/25 - DBAI-881: Map search record to corresponding
Rem                           langdata$question_stats.question_id
Rem    jiangnhu    06/24/25 - DBAI-909: Add domain_id in
Rem                           langdata$value_vector_partition_descriptions
Rem    saloshah    06/18/25 - DBAI-745-757: Added ner metrics table
Rem    arevathi    06/03/25 - Added langdata$searchrecords_archive table
Rem    jiangnhu    06/02/25 - DBAI-844: Merge langdata$value_vector_metadata,
Rem                           langdata$annotations_catalog and
Rem                           langdata$comments_catalog
Rem    jiangnhu    05/28/25 - DBAI-828: Create vector index on langdata
Rem                           tables vector column only if exceed enumeration
Rem                           limit 10k
Rem    jiangnhu    05/21/25 - DBAI-767: Add langdata$drilldownvalues,
Rem                           langdata$value_vector_partition_descriptions
Rem    saloshah    05/19/25 - DBAI-746: Add langdata$api_metrics
Rem    deveverm    05/16/25 - DBAI-761: added status to langdata$samplequeries
Rem    dadoshi     05/15/25 - JIRA_DBAI804: Update langdata to store context
Rem                           embeddings as CLOB instead of JSON
Rem    jiangnhu    05/05/25 - DBAI-755: Add langdata$comments_catalog
Rem    anisbans    05/01/25 - DBAI-744 :  Added the variables top1_count,
Rem                           top3_count and top5_count to langdata$reports and
Rem                           langdata$drilldowndocuments
Rem    anisbans    04/24/25 - DBAI-751: added langdata$question_stats table
Rem    jiangnhu    04/24/25 - DBAI-662: Extend the logic of filters: support
Rem                           table column in another PDB or database
Rem    jiangnhu    04/17/25 - DBAI-739: Add langdata$plot_input
Rem    dadoshi     04/24/25 - JIRA_DBAI525: Added langdata$re_ranking_inputs
Rem    deveverm    04/10/25 - DBAI-721: Increased variable_value length
Rem    jiangnhu    04/09/25 - DBAI-731: Remove table langdata$user_ctx_indexes,
Rem                           add schema_name to langdata$value_vector_metadata
Rem    deveverm    04/01/25 - DBAI-523: added regression_json
Rem    anisbans    03/24/25 - DBAI-518: Update table names
Rem    jiangnhu    03/19/25 - DBAI-543: Better naming conventions for
Rem                           augmentation/amending
Rem    anisbans    03/13/25 - DBAI-518: Change name of tables
Rem    deveverm    03/11/25 - DBAI-546: added schema_name for cross_schema
Rem                           support
Rem    anisbans    03/11/25 - DBAI-556 : Add ref_count to langdata$user_ctx_indexes
Rem    jiangnhu    03/07/25 - Drop config table if exists
Rem    anisbans    03/04/25 - Add value_vector_metadata table
Rem    dadoshi     03/04/25 - Add config table
Rem    jiangnhu    02/28/25 - Set amended_text, augmented_query_text to
Rem                           varchar2(4000);
Rem    deveverm    02/27/25 - DBAI_598: added drop table from temp tables
Rem    jiangnhu    02/13/25 - DBAI-555: Add langdata$user_ctx_indexes
Rem    jiangnhu    01/31/25 - DBAI-511: Add augmented_tokens field
Rem    jiangnhu    01/29/25 - DBAI-505: Implement centroid version of
Rem                           search_from_query procedure
Rem    jiangnhu    01/23/25 - Add the creation of global temporary tables
Rem    dadoshi     12/09/24 - Drop langdata table if exists
Rem    jiangnhu    10/30/24 - Remove lookup table
Rem    dadoshi     10/21/24 - Remove text wrapping
Rem    dadoshi     10/18/24 - JIRA_DBAI-399: Update template
Rem    pryarla     10/16/24 - Created
Rem
-- Start from a clean slate so this script can be re-run: every Lang-Data
-- table is dropped if it is present. The statement order is significant.

-- Auxiliary, temporary and legacy tables. langdata$user_ctx_indexes is not
-- created by this script any more; the drop only cleans up older installs.
DROP TABLE IF EXISTS langdata$apex_filters;
DROP TABLE IF EXISTS langdata$ner_metrics;
DROP TABLE IF EXISTS langdata$value_vector_partition_descriptions;
DROP TABLE IF EXISTS langdata$drilldownvalues;
DROP TABLE IF EXISTS langdata$plot_input;
DROP TABLE IF EXISTS langdata$re_ranking_inputs;
DROP TABLE IF EXISTS langdata$value_vector_metadata;
DROP TABLE IF EXISTS langdata$nerstate;
DROP TABLE IF EXISTS langdata$stoplist;
DROP TABLE IF EXISTS langdata$filtervaluestemp;
DROP TABLE IF EXISTS langdata$config;
DROP TABLE IF EXISTS langdata$user_ctx_indexes;
DROP TABLE IF EXISTS langdata$jobs_history;

-- Tables linked by foreign keys: referencing (child) tables are dropped
-- before the tables they reference, ending with langdata$reports,
-- langdata$domains and langdata$question_stats.
DROP TABLE IF EXISTS langdata$reportquerycluster;
DROP TABLE IF EXISTS langdata$drilldownquerycluster;
DROP TABLE IF EXISTS langdata$named_entities;
DROP TABLE IF EXISTS langdata$samplequeries;
DROP TABLE IF EXISTS langdata$schemaversion;
DROP TABLE IF EXISTS langdata$searchrecords_archive;
DROP TABLE IF EXISTS langdata$searchrecords;
DROP TABLE IF EXISTS langdata$drilldowndescriptions;
DROP TABLE IF EXISTS langdata$drilldowndocuments;
DROP TABLE IF EXISTS langdata$reportdescriptions;
DROP TABLE IF EXISTS langdata$reports;
DROP TABLE IF EXISTS langdata$domains;
DROP TABLE IF EXISTS langdata$api_logs;
DROP TABLE IF EXISTS langdata$question_stats;

-- Domains. Reports, drilldown documents, search records and value-vector
-- partition descriptions each carry an optional domain_id that points here.
CREATE TABLE langdata$domains (
    domain_id   VARCHAR2(36),
    name        VARCHAR2(255) NOT NULL,
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (domain_id),
    -- Domain names are unique across the table.
    UNIQUE (name)
);

-- TODO: Create Triggers for updated_at and also create indices for vectors
-- Reports. Titles are unique; each report optionally belongs to a domain.
-- created_at/updated_at are only populated by their column defaults here;
-- this script defines no trigger that refreshes updated_at.
CREATE TABLE langdata$reports (
    id              VARCHAR2(36),
    title           VARCHAR2(255) NOT NULL,
    match_document  JSON NOT NULL,
    status          VARCHAR2(20) DEFAULT 'Pending Review',
    domain_id       VARCHAR2(36),
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Defaults to an empty JSON object rather than NULL.
    analytics_data  JSON DEFAULT JSON_OBJECT(),
    PRIMARY KEY (id),
    UNIQUE (title),
    -- Allowed lifecycle states. The column is nullable, so NULL also passes.
    CHECK (status IN ('Pending Review',
                      'Approved',
                      'Rejected',
                      'Published',
                      'Inactive',
                      'Archived',
                      'Pending Regression')),
    -- Deleting a domain detaches its reports instead of deleting them.
    CONSTRAINT fk_report_domain
        FOREIGN KEY (domain_id)
        REFERENCES langdata$domains (domain_id)
        ON DELETE SET NULL
);

CREATE INDEX idx_langdata_reports_created_at
    ON langdata$reports (created_at);


-- Versioned descriptions of a report. A (report_id, version) pair is unique,
-- and descriptions are deleted together with their report.
CREATE TABLE langdata$reportdescriptions (
    id                  VARCHAR2(36),
    report_id           VARCHAR2(36) NOT NULL,
    text                VARCHAR2(2000) NOT NULL,
    version             NUMBER,
    status              VARCHAR2(20) DEFAULT 'Pending Review',
    regression_json     JSON DEFAULT NULL,
    created_at          TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at          TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Flexible vector column: neither dimension count nor format is fixed.
    description_vector  VECTOR(*, *),
    description_md5     VARCHAR2(32) NOT NULL,
    enhanced_text       VARCHAR2(4000) NOT NULL,
    augmented_tokens    JSON,
    PRIMARY KEY (id),
    -- Same lifecycle states as langdata$reports; NULL is also accepted.
    CHECK (status IN ('Pending Review',
                      'Approved',
                      'Rejected',
                      'Published',
                      'Inactive',
                      'Archived',
                      'Pending Regression')),
    FOREIGN KEY (report_id)
        REFERENCES langdata$reports (id)
        ON DELETE CASCADE,
    UNIQUE (report_id, version)
);

CREATE INDEX idx_langdata_reportdescriptions_created_at
    ON langdata$reportdescriptions (created_at);

-- Drilldown documents. Each belongs to exactly one report and is deleted
-- with it; titles only need to be unique within a report.
CREATE TABLE langdata$drilldowndocuments (
    id              VARCHAR2(36),
    title           VARCHAR2(255) NOT NULL,
    report_id       VARCHAR2(36) NOT NULL,
    match_document  JSON NOT NULL,
    status          VARCHAR2(20) DEFAULT 'Pending Review',
    domain_id       VARCHAR2(36),
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Defaults to an empty JSON object rather than NULL.
    analytics_data  JSON DEFAULT JSON_OBJECT(),
    PRIMARY KEY (id),
    -- NOTE(review): unlike langdata$reports and both description tables,
    -- 'Pending Regression' is not an allowed status here - confirm that this
    -- is intentional.
    CHECK (status IN ('Pending Review',
                      'Approved',
                      'Rejected',
                      'Published',
                      'Inactive',
                      'Archived')),
    -- Deleting a domain detaches its drilldown documents.
    CONSTRAINT fk_drilldown_domain
        FOREIGN KEY (domain_id)
        REFERENCES langdata$domains (domain_id)
        ON DELETE SET NULL,
    FOREIGN KEY (report_id)
        REFERENCES langdata$reports (id)
        ON DELETE CASCADE,
    UNIQUE (report_id, title)
);

CREATE INDEX idx_langdata_drilldowndocuments_created_at
    ON langdata$drilldowndocuments (created_at);

-- Versioned descriptions of a drilldown document. A (drilldown_id, version)
-- pair is unique, and descriptions are deleted with their drilldown document.
CREATE TABLE langdata$drilldowndescriptions (
    id                VARCHAR2(36),
    drilldown_id      VARCHAR2(36) NOT NULL,
    text              VARCHAR2(2000) NOT NULL,
    version           NUMBER,
    status            VARCHAR2(20) DEFAULT 'Pending Review',
    regression_json   JSON DEFAULT NULL,
    created_at        TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at        TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Flexible vector column: neither dimension count nor format is fixed.
    ddd_vector        VECTOR(*, *),
    ddd_md5           VARCHAR2(32) NOT NULL,
    enhanced_text     VARCHAR2(4000) NOT NULL,
    augmented_tokens  JSON,
    PRIMARY KEY (id),
    -- Allowed lifecycle states; the column is nullable, so NULL also passes.
    CHECK (status IN ('Pending Review',
                      'Approved',
                      'Rejected',
                      'Published',
                      'Inactive',
                      'Archived',
                      'Pending Regression')),
    FOREIGN KEY (drilldown_id)
        REFERENCES langdata$drilldowndocuments (id)
        ON DELETE CASCADE,
    UNIQUE (drilldown_id, version)
);

-- NOTE(review): the index name spells "drilldownescriptions" (missing "d").
-- It is kept as-is because other scripts may refer to the index by name.
CREATE INDEX idx_langdata_drilldownescriptions_created_at
    ON langdata$drilldowndescriptions (created_at);

-- Per-question statistics: how often a question has been asked and when it
-- was asked last. langdata$searchrecords.question_id references this table.
CREATE TABLE langdata$question_stats (
    question_id      VARCHAR2(36),
    question_text    VARCHAR2(4000),
    question_vector  VECTOR(*, *),
    -- A newly inserted question counts as asked once.
    asked_count      NUMBER DEFAULT 1,
    -- NOTE(review): defaults to SYSTIMESTAMP, whereas most tables in this
    -- script default their timestamps to CURRENT_TIMESTAMP.
    last_asked_at    TIMESTAMP DEFAULT SYSTIMESTAMP,
    PRIMARY KEY (question_id)
);

-- One row per search request, together with its matches and any feedback
-- given on it. All four foreign keys are ON DELETE SET NULL, so deleting a
-- referenced report, drilldown document, domain or question keeps the record.
CREATE TABLE langdata$searchrecords (
    id                        VARCHAR2(36),
    query_text                VARCHAR2(2000) NOT NULL,
    recognized_entities       JSON,
    expected_report_id        VARCHAR2(36),
    expected_drilldown_id     VARCHAR2(36),
    -- Array of objects holding the top-k matched reports and the
    -- corresponding drilldown/filter values.
    report_matches            JSON NOT NULL,
    -- TRUE = positive feedback, FALSE = negative feedback.
    feedback_rating           BOOLEAN,
    feedback_comments         VARCHAR2(2000),
    query_vector              VECTOR(*, *),
    query_md5                 VARCHAR2(32) NOT NULL,
    search_type               VARCHAR2(32) DEFAULT 'Hierarchical',
    username                  VARCHAR2(32) DEFAULT 'anonymous' NOT NULL,
    -- TODO: Add expected report id and description id?
    -- (expected_report_id already exists above as a nullable column.)
    required_feedback_action  VARCHAR2(20) DEFAULT 'Pending Review',
    feedback_action_priority  VARCHAR2(20) DEFAULT 'low',
    created_at                TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at                TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    domain_id                 VARCHAR2(36),
    plot_image                BLOB,
    augmented_query_text      VARCHAR2(4000),
    augmented_tokens          JSON,
    -- Links the record to its row in langdata$question_stats.
    question_id               VARCHAR2(36),
    PRIMARY KEY (id),
    CHECK (search_type IN ('Hierarchical', 'Flat')),
    CHECK (required_feedback_action IN ('New Report',
                                        'Update Report',
                                        'Pending Review',
                                        'None')),
    CHECK (feedback_action_priority IN ('low', 'medium', 'high')),
    FOREIGN KEY (expected_report_id)
        REFERENCES langdata$reports (id)
        ON DELETE SET NULL,
    FOREIGN KEY (expected_drilldown_id)
        REFERENCES langdata$drilldowndocuments (id)
        ON DELETE SET NULL,
    CONSTRAINT fk_search_domain
        FOREIGN KEY (domain_id)
        REFERENCES langdata$domains (domain_id)
        ON DELETE SET NULL,
    FOREIGN KEY (question_id)
        REFERENCES langdata$question_stats (question_id)
        ON DELETE SET NULL
);

-- Archived entries from langdata$searchrecords. These records are considered
-- old and are no longer active. Archiving only happens on explicit request,
-- through the lang_data_cleanup_pkg.archive_search_records procedure.
--
-- Only a subset of the live table's columns is kept, and the archive carries
-- no defaults or check constraints apart from archived_at.
CREATE TABLE langdata$searchrecords_archive (
    id                        VARCHAR2(36),
    query_text                VARCHAR2(2000) NOT NULL,
    expected_report_id        VARCHAR2(36),
    expected_drilldown_id     VARCHAR2(36),
    -- Array of objects holding the top-k matched reports and the
    -- corresponding drilldown/filter values.
    report_matches            JSON NOT NULL,
    -- NOTE(review): NUMBER here, but BOOLEAN in langdata$searchrecords -
    -- confirm the archive procedure converts the value as intended.
    feedback_rating           NUMBER,
    feedback_comments         VARCHAR2(2000),
    search_type               VARCHAR2(32),
    username                  VARCHAR2(32),
    required_feedback_action  VARCHAR2(20),
    feedback_action_priority  VARCHAR2(20),
    created_at                TIMESTAMP,
    updated_at                TIMESTAMP,
    -- Defaults to the time the row is inserted into the archive.
    archived_at               TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    augmented_query_text      VARCHAR2(4000),
    PRIMARY KEY (id),
    FOREIGN KEY (expected_report_id)
        REFERENCES langdata$reports (id)
        ON DELETE SET NULL,
    FOREIGN KEY (expected_drilldown_id)
        REFERENCES langdata$drilldowndocuments (id)
        ON DELETE SET NULL
);

-- Secondary indexes on the live langdata$searchrecords table (not on the
-- archive table defined just above): one on creation time, one on the
-- query hash.
CREATE INDEX idx_langdata_searchrecords_created_at
    ON langdata$searchrecords (created_at);

CREATE INDEX idx_searchrecords_md5
    ON langdata$searchrecords (query_md5);

-- Schema versions recorded for this installation, keyed by version number.
CREATE TABLE langdata$schemaversion (
    version_number  VARCHAR2(50),
    description     VARCHAR2(255),
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (version_number),
    -- A version is four dot-separated groups of one or two digits each,
    -- for example 23.0.0.1.
    CONSTRAINT check_version_format
        CHECK (REGEXP_LIKE(version_number, '^\d{1,2}(\.\d{1,2}){3}$'))
);

-- Sample queries attached to a report and, optionally, to one of its
-- drilldown documents. Rows are deleted when either parent is deleted.
CREATE TABLE langdata$samplequeries (
    id                   VARCHAR2(36),
    query_text           VARCHAR2(2000) NOT NULL,
    query_vector         VECTOR(*, *),
    version              NUMBER,
    report_id            VARCHAR2(36) NOT NULL,
    drilldown_id         VARCHAR2(36),
    created_at           TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at           TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    query_md5            VARCHAR2(32) NOT NULL,
    enhanced_query_text  VARCHAR2(4000),
    augmented_tokens     JSON,
    regression_json      JSON DEFAULT NULL,
    -- Unlike reports and descriptions, sample queries default to 'Published'.
    status               VARCHAR2(20) DEFAULT 'Published',
    PRIMARY KEY (id),
    CHECK (status IN ('Pending Review',
                      'Approved',
                      'Rejected',
                      'Published',
                      'Inactive',
                      'Archived',
                      'Pending Regression')),
    FOREIGN KEY (report_id)
        REFERENCES langdata$reports (id)
        ON DELETE CASCADE,
    FOREIGN KEY (drilldown_id)
        REFERENCES langdata$drilldowndocuments (id)
        ON DELETE CASCADE,
    UNIQUE (report_id, version)
);

CREATE INDEX idx_langdata_samplequeries_created_at
    ON langdata$samplequeries (created_at);

-- The query hash is unique across the whole table, not just per report.
CREATE UNIQUE INDEX idx_samplequeries_md5
    ON langdata$samplequeries (query_md5);

-- Query clusters per report: one row per cluster, holding that cluster's
-- centroid vector. Clusters are deleted together with their report.
CREATE TABLE langdata$reportquerycluster (
    cluster_id       VARCHAR2(36) PRIMARY KEY,
    report_id        VARCHAR2(36) NOT NULL,
    -- Flexible vector column: neither dimension count nor format is fixed.
    centroid_vector  VECTOR(*, *),
    FOREIGN KEY (report_id)
        REFERENCES langdata$reports (id)
        ON DELETE CASCADE
);

-- Index the foreign key column. Without it, each delete on langdata$reports
-- must full-scan this table to cascade, and Oracle takes a table-level lock
-- on this child table while the parent row is being deleted.
CREATE INDEX idx_langdata_reportquerycluster_report_id
    ON langdata$reportquerycluster (report_id);

-- Query clusters per drilldown document: one row per cluster, holding that
-- cluster's centroid vector. Clusters are deleted together with their
-- drilldown document.
CREATE TABLE langdata$drilldownquerycluster (
    cluster_id       VARCHAR2(36) PRIMARY KEY,
    drilldown_id     VARCHAR2(36) NOT NULL,
    -- Flexible vector column: neither dimension count nor format is fixed.
    centroid_vector  VECTOR(*, *),
    FOREIGN KEY (drilldown_id)
        REFERENCES langdata$drilldowndocuments (id)
        ON DELETE CASCADE
);

-- Index the foreign key column. Without it, each delete on
-- langdata$drilldowndocuments (including deletes cascading down from
-- langdata$reports) must full-scan this table, and Oracle takes a
-- table-level lock on this child table while the parent row is deleted.
CREATE INDEX idx_langdata_drilldownquerycluster_drilldown_id
    ON langdata$drilldownquerycluster (drilldown_id);

-- Named entities, each with a unique name.
CREATE TABLE langdata$named_entities (
    id    VARCHAR2(36),
    name  VARCHAR2(255) NOT NULL,
    PRIMARY KEY (id),
    UNIQUE (name)
);

-- History of jobs and the object/table/column they relate to.
-- NOTE(review): the table has no primary key or unique constraint and every
-- column is nullable, so duplicate rows are possible - confirm intended.
CREATE TABLE langdata$jobs_history (
    object_name   VARCHAR2(128),
    table_name    VARCHAR2(128),
    column_name   VARCHAR2(128),
    schema_name   VARCHAR2(128),
    db_link_name  VARCHAR2(255),
    job_name      VARCHAR2(250),
    -- Set to the insert time unless a value is supplied explicitly.
    updated_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Configuration settings stored as name/value pairs, one row per variable.
CREATE TABLE langdata$config (
    variable_name   VARCHAR2(100),
    -- Values are stored as text, up to 200 characters.
    variable_value  VARCHAR2(200),
    PRIMARY KEY (variable_name)
);

-- Stop words, one row per word.
CREATE TABLE langdata$stoplist (
    spw_word  VARCHAR2(255),
    -- Origin of the word; rows inserted without a source get 'LANGDATA'.
    source    VARCHAR2(50) DEFAULT 'LANGDATA',
    PRIMARY KEY (spw_word)
);

-- Scratch table used while search_from_query runs: it caches the identified
-- entities and the entities with context. As a global temporary table with
-- ON COMMIT DELETE ROWS, its rows are private to the session and are removed
-- at every commit.
CREATE GLOBAL TEMPORARY TABLE langdata$nerstate (
    entity_type         VARCHAR2(100),
    entities            JSON,
    -- Context embeddings are stored as a CLOB rather than as JSON.
    context_embeddings  CLOB
)
ON COMMIT DELETE ROWS;

-- Scratch table used while search_from_query runs: it caches the filter
-- values identified for each filter. Rows are private to the session and are
-- removed at every commit.
CREATE GLOBAL TEMPORARY TABLE langdata$filtervaluestemp (
    filter_name   VARCHAR2(255),
    filter_value  VARCHAR2(4000),
    reason        VARCHAR2(4000)
)
ON COMMIT DELETE ROWS;

-- Catalog of value vector tables: one row per value vector table, recording
-- the source column it is associated with plus that column's annotation and
-- comment.
CREATE TABLE langdata$value_vector_metadata (
    -- Name of the value vector table
    vvec_table_name        VARCHAR2(128),
    -- Associated table
    table_name             VARCHAR2(128) NOT NULL,
    -- Associated column
    column_name            VARCHAR2(128) NOT NULL,
    -- Associated schema
    schema_name            VARCHAR2(128) NOT NULL,
    db_link_name           VARCHAR2(255),
    -- Usage reference count
    ref_count              NUMBER DEFAULT 0,
    annotation_value       VARCHAR2(4000),
    comment_value          VARCHAR2(4000),
    annotation_changed_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    comment_changed_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    created_at             TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (vvec_table_name),
    -- A given table/column/schema combination can appear only once.
    -- db_link_name is not part of this key.
    UNIQUE (table_name, column_name, schema_name)
);

-- Inputs handed to the reranker. Because the table is a global temporary
-- table with ON COMMIT PRESERVE ROWS, its rows survive across transactions
-- within a session and are isolated from other sessions.
CREATE GLOBAL TEMPORARY TABLE langdata$re_ranking_inputs (
    query_text   VARCHAR2(2000),
    doc_text     VARCHAR2(4000),
    doc_text_id  VARCHAR2(36)
)
ON COMMIT PRESERVE ROWS;

-- Log of external API calls, one row per call.
--   id              - ID of this log entry
--   api_name        - name of the API
--   called_at       - time at which the API was called
--   call_time       - time taken to execute the call, in centiseconds
--   failure_status  - TRUE if the call failed, FALSE otherwise
--   failure_code    - Oracle error code
--   failure_message - failure message, if any
CREATE TABLE langdata$api_logs (
    id               VARCHAR2(36),
    api_name         VARCHAR2(100),
    called_at        TIMESTAMP,
    call_time        NUMBER,
    failure_status   BOOLEAN DEFAULT FALSE,
    failure_code     NUMBER DEFAULT NULL,
    failure_message  VARCHAR2(4000) DEFAULT NULL,
    PRIMARY KEY (id)
);

-- Vectors to be plotted, labelled with the report they belong to. The table
-- has no keys or foreign keys; report_id is a plain column.
CREATE TABLE langdata$plot_input (
    report_id     VARCHAR2(36),
    report_title  VARCHAR2(255),
    -- One of 'actual query' / 'description' / 'sample query'
    -- (not enforced by a constraint).
    type          VARCHAR2(20),
    vector        VECTOR(*, *),
    highlight     NUMBER DEFAULT 0
);

-- Drilldown values and their vectors, list-partitioned by partition_name.
CREATE TABLE langdata$drilldownvalues (
    value           VARCHAR2(4000),
    vvec            VECTOR(*, *),
    partition_name  VARCHAR2(128) NOT NULL
)
PARTITION BY LIST (partition_name) (
    -- Placeholder partition, needed only so the table can be created;
    -- it will be dropped later.
    PARTITION P_DUMMY VALUES ('DUMMY')
);

-- Oracle Text index on the value column.
CREATE INDEX langdata$drilldownvalues_ctx_idx
    ON langdata$drilldownvalues (value)
    INDEXTYPE IS CTXSYS.CONTEXT;

-- Description and comment text per partition name, together with a vector
-- for the combined text. Each row may optionally be tied to a domain.
CREATE TABLE langdata$value_vector_partition_descriptions (
    partition_name         VARCHAR2(128),
    description            VARCHAR2(4000),
    comment_text           VARCHAR2(4000),
    desc_comment_combined  VARCHAR2(4000),
    desc_comment_vector    VECTOR(*, *),
    domain_id              VARCHAR2(36),
    PRIMARY KEY (partition_name),
    -- Deleting a domain clears domain_id here but keeps the row.
    CONSTRAINT fk_vvec_partition_domain
        FOREIGN KEY (domain_id)
        REFERENCES langdata$domains (domain_id)
        ON DELETE SET NULL
);

-- NER metrics stored as name/value pairs, one row per metric.
CREATE TABLE langdata$ner_metrics (
    metric_name   VARCHAR2(100),
    metric_value  NUMBER,
    PRIMARY KEY (metric_name)
);

-- Backing table for the IG on the APEX filters page. Rows carry a session_id
-- column; the table has no keys or constraints and every column is nullable.
CREATE TABLE langdata$apex_filters (
    filter_name     VARCHAR2(100),
    default_value   VARCHAR2(100),
    use_ner         BOOLEAN,
    entity_type     VARCHAR2(100),
    table_name      VARCHAR2(100),
    column_name     VARCHAR2(100),
    schema_name     VARCHAR2(100),
    db_link_name    VARCHAR2(100),
    enumerable_set  JSON,
    description     VARCHAR2(4000),
    session_id      VARCHAR2(36),
    -- NOTE(review): defaults to SYSTIMESTAMP, whereas most tables in this
    -- script default their timestamps to CURRENT_TIMESTAMP.
    created_at      TIMESTAMP DEFAULT SYSTIMESTAMP
);
