//
// See the file LICENSE for redistribution information.
//
// Copyright (c) 2002-2003
//	Sleepycat Software.  All rights reserved.
//

// Revision-control identifier string for this source file ($Id$ keyword format).
static const char revid[] = "$Id: Indexer.cpp,v 1.78 2003/10/13 16:39:44 merrells Exp $";

#include "dbxml_config.h"
#include "dbxml/XmlPortability.hpp"
#include "dbxml/XmlException.hpp"
#include "Indexer.hpp"
#include "Log.hpp"
#include "Container.hpp"
#include "SyntaxManager.hpp"
#include "UTF8.hpp"
#include "Document.hpp"

#if defined(DBXML_DOM_XERCES2)
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/framework/MemBufInputSource.hpp>
#include <xercesc/sax2/SAX2XMLReader.hpp>
#include <xercesc/sax2/XMLReaderFactory.hpp>
#if defined(XERCES_HAS_CPP_NAMESPACE)
  XERCES_CPP_NAMESPACE_USE
#endif
#endif

#include <sstream>
#include <string>
#include <map>
#include <algorithm>
#include <iostream>

using namespace DbXml;
using namespace std;

// Transcoder

// Initializes the Xerces platform and asks its transcoding service for a
// UTF-8 transcoder. transcoder_ stays 0 if no version branch below applies.
Transcoder::Transcoder()
	: transcoder_(0)
{
	XMLPlatformUtils::Initialize();

	const unsigned int blockSize = 4 * 1024;
	XMLTransService *service = XMLPlatformUtils::fgTransService;
	XMLTransService::Codes failReason;
	// From Xerces 2.3 onwards the factory also takes a memory manager.
#if XERCES_VERSION_MAJOR==2 && XERCES_VERSION_MINOR<=2
	transcoder_ = service->makeNewTranscoderFor("UTF-8", failReason, blockSize);
#elif XERCES_VERSION_MAJOR==2 && XERCES_VERSION_MINOR>=3
	transcoder_ = service->makeNewTranscoderFor("UTF-8", failReason, blockSize, XMLPlatformUtils::fgMemoryManager);
#endif
}

// Destroys the transcoder created in the constructor and balances the
// constructor's XMLPlatformUtils::Initialize() call.
Transcoder::~Transcoder()
{
	delete transcoder_;
	XMLPlatformUtils::Terminate();
}

// Transcodes an expanded name into the buffer b as "uri:localname", or just
// "localname" when the uri is empty, and returns a pointer to the result.
//
// b         - scratch buffer that receives the bytes; it is grown as needed
//             and never shrunk.
// uri       - namespace URI of the name (null-terminated, may be empty).
// localname - local part of the name (null-terminated).
//
// The returned pointer refers to b's storage, so it is only valid until b is
// next resized or destroyed. The terminating null of localname is transcoded
// too, so the result is a null-terminated string.
const char *Transcoder::transcodeName(XMLByteVector &b, const XMLCh* const uri, const XMLCh* const localname) const
{
	const size_t uriLength = XMLString::stringLen(uri);
	const size_t localnameLength = XMLString::stringLen(localname);
	// 3 bytes per XMLCh is the worst case, + ':' + '\0'
	const size_t needed = (uriLength + localnameLength) * 3 + 1 + 1;
	// Grow the vector's size, not merely its capacity: indexing elements at
	// or beyond size() is undefined behaviour.
	if (b.size() < needed) {
		b.resize(needed);
	}
	unsigned int charsEaten = 0;
	size_t targetLength = 0;
	if (uriLength > 0) {
		targetLength = transcoder_->transcodeTo(uri, uriLength, &b[0], b.size(), charsEaten, XMLTranscoder::UnRep_Throw);
		b[targetLength++] = ':';
	}
	// The local name is written after the prefix, so only the remainder of
	// the buffer is available to it. "+ 1" includes the terminating null.
	transcoder_->transcodeTo(localname, localnameLength + 1, &b[targetLength], b.size() - targetLength, charsEaten, XMLTranscoder::UnRep_Throw);
	return (const char*)&b[0];
}

// Transcodes sourceLength XMLCh units starting at s into the buffer b and
// returns a pointer to the transcoded bytes.
//
// b            - scratch buffer that receives the bytes; it is grown as
//                needed and never shrunk.
// s            - source characters (need not be null-terminated).
// sourceLength - number of XMLCh units to transcode; may be zero.
// targetLength - set to the number of bytes written into b.
//
// The returned pointer refers to b's storage, so it is only valid until b is
// next resized or destroyed. No terminator is appended beyond what the source
// range itself contains.
const char *Transcoder::transcode(XMLByteVector &b, const XMLCh* const s, size_t sourceLength, size_t &targetLength) const
{
	// 3 bytes per XMLCh is the worst case. Keep at least one byte so that
	// &b[0] is well defined even for an empty source.
	size_t needed = sourceLength * 3;
	if (needed == 0) {
		needed = 1;
	}
	// Grow the vector's size, not merely its capacity: indexing elements at
	// or beyond size() is undefined behaviour.
	if (b.size() < needed) {
		b.resize(needed);
	}
	unsigned int offset = 0;
	unsigned int consumed = 0;
	// The transcoder may stop before consuming everything, so keep going
	// until the whole source range has been eaten.
	// NOTE(review): if transcodeTo ever reports zero characters eaten this
	// loop does not terminate - confirm that cannot happen for our inputs.
	while (sourceLength > consumed) {
		unsigned int charsEaten = 0;
		offset += transcoder_->transcodeTo(&s[consumed], sourceLength - consumed, &b[offset], b.size() - offset, charsEaten, XMLTranscoder::UnRep_Throw);
		consumed += charsEaten;
	}
	targetLength = offset;
	return (const char*)&b[0];
}

// Convenience overload for null-terminated input: transcodes the whole of s,
// including its terminating null, and discards the resulting byte count.
const char *Transcoder::transcode(XMLByteVector &b, const XMLCh* const s) const
{
	size_t unusedTargetLength = 0;
	const size_t lengthWithNull = XMLString::stringLen(s) + 1;
	return transcode(b, s, lengthWithNull, unusedTargetLength);
}

// IndexerState

// Constructs an empty state with all pointers null. initialize() performs
// the allocations before the state is used.
IndexerState::IndexerState()
	: key_(0),
	iv_(0),
	name_(0),
	value_(0)
{}

// Releases the key and the two transcoding buffers allocated by
// initialize(). iv_ is not deleted here.
IndexerState::~IndexerState()
{
	delete key_;
	delete name_;
	delete value_;
}

// Allocates the key and the name/value byte buffers and clears the index
// vector pointer.
// NOTE(review): the previous pointers are overwritten without being deleted,
// so calling this twice on the same object would leak - confirm it is only
// called once per instance.
void IndexerState::initialize()
{
	key_ = new Key;
	iv_ = 0;
	name_ = new XMLByteVector;
	value_ = new XMLByteVector;
}

// Begins a node whose name is already available as a Name object: resolves
// the name to an ID (defining it if necessary) directly into the key, and
// selects the index vector that applies to the name.
void IndexerState::startNode(Container &container, const IndexSpecification &indexSpecification, DbTxn *txn, const Name &name)
{
	ID &nameId = key_->getID1();
	container.lookupName(txn, name, nameId, /*define=*/true);

	iv_ = indexSpecification.getIndexOrDefault(name.getURIName().c_str());
}

// Begins a node reported by the parser: resets the key, transcodes the
// uri/localname pair into name_, and selects the index vector that applies
// to that name. The name ID is not resolved here; getKey() does that later
// if the key's ID is still zero.
void IndexerState::startNode(const IndexSpecification &indexSpecification, const Transcoder &transcoder, const XMLCh* const uri, const XMLCh* const localname)
{
	key_->reset();
	iv_ = indexSpecification.getIndexOrDefault(transcoder.transcodeName(*name_, uri, localname));
}

// Adds l bytes starting at s to the key's value via Key::addValue.
void IndexerState::characters(const char *s, size_t l)
{
	key_->addValue(s, l);
}

// Transcodes length XMLCh units from chars (using value_ as the scratch
// buffer) and adds the resulting bytes to the key's value.
void IndexerState::characters(const Transcoder &transcoder, const XMLCh* const chars, const unsigned int length)
{
	size_t byteCount = 0;
	const char *bytes = transcoder.transcode(*value_, chars, length, byteCount);
	key_->addValue(bytes, byteCount);
}

void IndexerState::characters(const Transcoder &transcoder, const XMLCh* const chars)
{
	characters(transcoder, chars, XMLString::stringLen(chars));
}

// True when an index vector has been selected for the current node and that
// vector reports itself as indexed.
bool IndexerState::isIndexed() const
{
	if (!iv_) {
		return false;
	}
	return iv_->isIndexed();
}

// Returns the key for the current node. If the key's first ID is still zero
// the name held in name_ is looked up in the container (and defined if it
// does not exist yet) so that the returned key carries a name ID.
Key &IndexerState::getKey(Container &container, DbTxn *txn)
{
	ID &nameId = key_->getID1();
	if (nameId == 0) {
		// name_ holds the transcoded name written by startNode().
		container.lookupName(txn, (const char *) & ((*name_)[0]), nameId, /*define=*/true);
	}
	return *key_;
}

// Prepares the state for reuse: resets the key and forgets the selected
// index vector. The name and value buffers are left untouched.
void IndexerState::reset()
{
	key_->reset();
	iv_ = 0;
}

// IndexerStateStack

// Creates an empty stack and pre-reserves room for 16 state pointers.
IndexerStateStack::IndexerStateStack()
	: top_(0)
{
	v_.reserve(16);
}

// Deletes every state ever allocated by push(), including those that are
// currently above top_ and only kept around for reuse.
IndexerStateStack::~IndexerStateStack()
{
	std::vector<IndexerState*>::iterator it = v_.begin();
	while (it != v_.end()) {
		delete *it;
		++it;
	}
}

// Pushes a state onto the stack and returns it. States are recycled: a new
// IndexerState is only allocated when the stack grows beyond the number of
// states created so far; otherwise the existing state at the new top is
// reset and reused. The stack retains ownership of the returned pointer.
IndexerState *IndexerStateStack::push()
{
	IndexerState *is = 0;
	if (top_ == v_.size()) {
		// Fully initialize the new state before storing it. If initialize()
		// or push_back() throws, the state is destroyed instead of being
		// left in v_ half-built, where a later push() would reuse it and
		// call reset() on a null key.
		is = new IndexerState;
		try {
			is->initialize();
			v_.push_back(is);
		} catch (...) {
			delete is;
			throw;
		}
		++top_;
	} else {
		is = v_[top_];
		++top_;
		is->reset();
	}
	return is;
}

// Returns the state on top of the stack, or 0 when the stack is empty.
IndexerState *IndexerStateStack::top()
{
	if (top_ == 0) {
		return 0;
	}
	return v_[top_ - 1];
}

// Removes the top state from the stack; a no-op when the stack is empty.
// The state object itself is kept in v_ so push() can reuse it.
void IndexerStateStack::pop()
{
	if (top_ == 0) {
		return;
	}
	--top_;
}

// True when no states are currently pushed (recycled states held in v_ do
// not count).
bool IndexerStateStack::empty()
{
	return (top_ == 0);
}

// Indexer

// Creates an indexer bound to the given container and sets up the SAX2
// parser that drives it. The Indexer registers itself as both the content
// handler and the error handler of the parser.
Indexer::Indexer(Container *container)
	: parser_(0),
	txn_(0),
	container_(container),
	indexSpecification_(0),
	attributesIndexed_(false),
	elementsIndexed_(false),
	document_(0),
	tracing_(isLogEnabled(C_INDEXER, L_DEBUG)) // This only applies to the debug build, but it clarifies profiling.
{
	XMLPlatformUtils::Initialize();
	parser_ = XMLReaderFactory::createXMLReader();
	// Select the scanner named by fgWFXMLScanner, and switch off validation,
	// external DTD loading and schema processing; namespace processing stays
	// on because the handlers work with uri/localname pairs.
	parser_->setProperty(XMLUni::fgXercesScannerName, (void*)XMLUni::fgWFXMLScanner);
	parser_->setFeature(XMLUni::fgSAX2CoreValidation, false);
	parser_->setFeature(XMLUni::fgSAX2CoreNameSpaces, true);
	parser_->setFeature(XMLUni::fgXercesLoadExternalDTD, false);
	parser_->setFeature(XMLUni::fgXercesSchema, false);
	parser_->setContentHandler(this);
	parser_->setErrorHandler(this);
}

// Destroys the parser and balances the constructor's
// XMLPlatformUtils::Initialize() call.
Indexer::~Indexer()
{
	delete parser_;
	XMLPlatformUtils::Terminate();
}

// Generates the index keys for a document and collects them in the key
// stash; nothing is written to the index databases here.
//
// txn                - transaction used for name lookups.
// indexSpecification - decides which nodes are indexed and how.
// document           - the document whose meta-data and content are indexed.
//
// Always returns 0; failures are reported by throwing XmlException. The
// pointers to the arguments are retained in members for use by the parser
// callbacks.
int Indexer::indexDocument(DbTxn *txn, const IndexSpecification &indexSpecification, const Document& document) // throws XmlException
{
	txn_ = txn;
	document_ = &document;
	indexSpecification_ = &indexSpecification;
	attributesIndexed_ = indexSpecification_->isIndexed(Index::NODE_ATTRIBUTE, Index::NODE_MASK);
	elementsIndexed_ = indexSpecification_->isIndexed(Index::NODE_ELEMENT, Index::NODE_MASK);

	if (tracing_) {
		ostringstream message;
		message << "Creating keys for document '" << document_->getName() << "'";
		container_->log(C_INDEXER, L_DEBUG, message);
	}

	if (attributesIndexed_) {
		// The document meta-data items are indexed as attributes. They have
		// no parent element, so only node keys (no edge keys) are produced.
		IndexerState *metaState = stateStack_.push();
		for (MetaData::const_iterator it = document_->metaDataBegin(); it != document_->metaDataEnd(); ++it) {
			const MetaDatum *datum = *it;
			if (!datum->insertIntoDocument()) {
				continue;
			}
			metaState->startNode(*container_, *indexSpecification_, txn, datum->getName());
			if (!metaState->isIndexed()) {
				continue;
			}
			// The last byte of the stored value is excluded
			// (presumably a trailing null - TODO confirm).
			metaState->characters((const char*)datum->getDbt()->get_data(), datum->getDbt()->get_size() - 1);
			addAttributeValueKey(0, metaState);
		}
		stateStack_.pop();
	}

	// Parsing the content drives the SAX callbacks, which generate the
	// remaining keys.
	const Dbt *content = document_->getContent();
	parse((const char*)content->get_data(), content->get_size()); // throws XmlException
	return 0;
}

// Applies the stashed keys to the index databases and then updates the key
// statistics stored in the container.
//
// context - operation context passed through to the database calls.
// add     - true to add the keys, false to delete them. A key is only added
//           when its index also reports indexerAdd(); otherwise it is
//           deleted.
//
// Returns 0 on success, or the first non-zero error code from a database
// call or the statistics update, at which point processing stops.
int Indexer::addOrDeleteKeys(OperationContext &context, bool add)
{
	int err = 0;
	try {
		if (tracing_) {
			ostringstream message;
			message << (add ? "Adding keys for document '" : "Deleting keys for document '")
			        << document_->getName() << "'";
			container_->log(C_INDEXER, L_DEBUG, message);
		}

		// Walk the key stash. Key buffers come out of the stash with the
		// correct endianness, so they are handed to the databases as they
		// are.
		KeyStash::iterator position;
		size_t keyLength = 0;
		Index index = 0;
		for (void *keyBuffer = keyStash_.first(position, keyLength, index);
		     keyBuffer != 0 && err == 0;
		     keyBuffer = keyStash_.next(position, keyLength, index)) {
			const bool addKey = add && index.indexerAdd();
			if (tracing_) {
				Buffer b(keyBuffer, keyLength);
				ostringstream message;
				message << (addKey ? "add " : "delete ")
				        << index.asString() << " " << b.asString().c_str();
				container_->log(C_INDEXER, L_DEBUG, message);
			}

			const bool isEqualityKey = (index.getKey() == Index::KEY_EQUALITY);
			const Syntax::Type syntaxType = (Syntax::Type)index.getSyntax();
			SecondaryDatabase *database = container_->getIndex(syntaxType);

			// Duplicate detection is only requested for equality keys.
			bool duplicate = false;
			bool *duplicateOut = (isEqualityKey ? &duplicate : 0);
			if (addKey) {
				err = database->putID(context, keyBuffer, keyLength, document_->getID(), duplicateOut);
			} else {
				err = database->delID(context, keyBuffer, keyLength, document_->getID(), duplicateOut);
			}

			if (err == 0) {
				const Syntax *syntax = SyntaxManager::getInstance()->getSyntax(syntaxType);
				statistics_.addToKeyStatistics(*syntax, index, keyBuffer, keyLength, isEqualityKey && !duplicate, addKey);
			}
		}

		// Persist the accumulated key statistics in the container.
		if (err == 0) {
			err = statistics_.updateContainer(context, *container_);
		}
	} catch (...) {
		// No cleanup is performed here; the exception is simply passed on
		// to the caller.
		throw;
	}
	return err;
}

// Returns the indexer to its idle state so that it can be used for another
// document: drops the per-document pointers, clears the stashed keys and
// statistics, and pops any states left on the stack.
void Indexer::reset()
{
	//
	// Cleanup
	//
	txn_ = 0;
	document_ = 0;
	indexSpecification_ = 0;
	keyStash_.reset();
	statistics_.reset();
	// States may still be on the stack if parsing was abandoned part way.
	while (!stateStack_.empty())
		stateStack_.pop();
}

// SAX callback for the start of an element. Pushes a state for the element
// and, when attributes are indexed, generates the keys for each indexed
// attribute. The element's own keys are generated in endElement().
void Indexer::startElement(const XMLCh* const uri, const XMLCh* const localname, const XMLCh* const qname, const Attributes& attributes)
{
	// A state is always pushed so that endElement() can always pop one.
	IndexerState *eis = stateStack_.push();
	if (!elementsIndexed_ && !attributesIndexed_) {
		return;
	}
	eis->startNode(*indexSpecification_, transcoder_, uri, localname);

	if (!attributesIndexed_ || !(attributes.getLength() > 0)) {
		return;
	}
	//
	// node-attribute-presence
	// edge-attribute-presence
	// edge-attribute-equality-*
	// node-attribute-equality-*
	// edge-attribute-substring-*
	// node-attribute-substring-*
	//
	// One temporary state is shared by all of the element's attributes.
	IndexerState *ais = stateStack_.push();
	unsigned int i = 0;
	while (i < attributes.getLength()) {
		ais->startNode(*indexSpecification_, transcoder_, attributes.getURI(i), attributes.getLocalName(i));
		if (ais->isIndexed()) {
			ais->characters(transcoder_, attributes.getValue(i));
			addAttributeValueKey(eis, ais);
		}
		++i;
	}
	stateStack_.pop();
}

// Generates the keys for one attribute.
//
// eis - state of the owning element, or 0 when there is none (in which case
//       no edge keys are generated).
// ais - state of the attribute, already holding its name and value.
void Indexer::addAttributeValueKey(IndexerState *eis, IndexerState *ais)
{
	Key &k = ais->getKey(*container_, txn_);

	// node-attribute-*
	k.setIndex(Index::NA);
	generateKeys(ais->iv(), Index::NA, Index::PN_MASK, k);

	if (eis == 0) {
		return;
	}

	// edge-attribute-*: the same key is reused, re-addressed as
	// (element name ID, attribute name ID).
	ID attributeId = k.getID1();
	k.set(Index::EA, eis->getKey(*container_, txn_).getID1(), attributeId);
	generateKeys(ais->iv(), Index::EA, Index::PN_MASK, k);
}

/**
 * SAX callback for character data. The text is appended to the value of
 * the element currently on top of the state stack, so the text nodes that
 * are direct children of an element are concatenated into a single key
 * value. Eg <a>x<b>y</b>z</a> yields a=xz, b=y
 */
void Indexer::characters(const XMLCh* const chars, const unsigned int length)
{
	if (!elementsIndexed_ || !(length > 0)) {
		return;
	}
	IndexerState *is = stateStack_.top();
	if (!is->isIndexed()) {
		return;
	}
	//
	// *-element-equality-*
	// *-element-substring-*
	//
	// Only these index kinds need the element's value. The parser may
	// deliver one run of text in several calls; each call simply appends to
	// the value. Leading whitespace is not trimmed.
	//
	const bool wantsValue =
		is->iv().isEnabled(Index::XEE, Index::NK_MASK) ||
		is->iv().isEnabled(Index::XES, Index::NK_MASK);
	if (wantsValue) {
		is->characters(transcoder_, chars, length);
	}
}

void Indexer::endElement(const XMLCh* const uri, const XMLCh* const localname, const XMLCh* const qname)
{
	UNUSED(uri);
	UNUSED(localname);
	UNUSED(qname);

	IndexerState *cis = 0; // child indexer state
	if (elementsIndexed_) {
		//
		// node-element-*-*
		//
		cis = stateStack_.top();
		if (cis->isIndexed()) {
			Key &k = cis->getKey(*container_, txn_);
			generateKeys(cis->iv(), Index::NE, Index::PN_MASK, k);
		}
	}
	stateStack_.pop();
	if (elementsIndexed_ && !stateStack_.empty() && cis->isIndexed()) {
		//
		// edge-element-*-*
		//
		IndexerState *pis = stateStack_.top(); // parent indexer state
		Key &k = cis->getKey(*container_, txn_);
		k.setID2(k.getID1());
		k.setID1(pis->getKey(*container_, txn_).getID1());
		generateKeys(cis->iv(), Index::EE, Index::PN_MASK, k);
	}
}

void Indexer::generateKeys(const IndexVector &iv, Index::Type pnk, Index::Type mask, Key &key)
{
	// Iterate over the index vector looking for indexes that match
	// the path-node-key provided (pnk). For each syntax create the
	// keys for that syntax. Note that the key passed through is
	// reused...
	//
	int i = 0;
	Index index;
	const Syntax *syntax = 0;
	syntax = iv.getNextSyntax(i, pnk, mask, index);
	while (syntax != 0) {
		key.setIndex(index);
		if (syntax->test(key.getValue(), key.getValueSize())) {
			if (tracing_) {
				ostringstream oss;
				oss << key.asString().c_str();
				container_->log(C_INDEXER, L_DEBUG, oss);
			}
			key.addKeyToStash(keyStash_, container_->swap());
		}
		syntax = iv.getNextSyntax(i, pnk, mask, index);
	}
}

// Parses an in-memory XML document, driving the SAX callbacks of this
// Indexer.
//
// buffer - start of the document bytes; not adopted by the input source.
// length - number of bytes in buffer.
//
// Throws XmlException(INDEXER_PARSER_ERROR) when the parser raises an
// XMLException. Exceptions thrown by the error handlers are not caught here
// and propagate unchanged.
void Indexer::parse(const char *buffer, size_t length) // throws XmlException
{
	try {
		// The document name doubles as the buffer id of the input source.
		MemBufInputSource theInputSource(reinterpret_cast<const XMLByte*>(buffer), length, document_->getName().c_str(), false);
		parser_->parse(theInputSource);
	} catch (const XMLException &e) {
		std::ostringstream s;
		s << "XML Indexer: Parse error in document '";
		s << document_->getName();
		// Close the quote opened before the document name.
		s << "'. Parser Error Message: " << XMLChToUTF8(e.getMessage()).str();
		throw XmlException(XmlException::INDEXER_PARSER_ERROR, s.str());
	}
}

void Indexer::error(const SAXParseException& e)
{
	std::ostringstream s;
	s << "XML Indexer: Parse error in document '";
	s << document_->getName();
	s << "', line " << e.getLineNumber();
	s << ", char " << e.getColumnNumber();
	s << ". Parser Error Message: " << XMLChToUTF8(e.getMessage()).str();
	throw XmlException(XmlException::INDEXER_PARSER_ERROR, s.str());
}

void Indexer::fatalError(const SAXParseException& e)
{
	std::ostringstream s;
	s << "XML Indexer: Fatal parse error in document '";
	s << document_->getName();
	s << "', line " << e.getLineNumber();
	s << ", char " << e.getColumnNumber();
	s << ". Parser Error Message: " << XMLChToUTF8(e.getMessage()).str();
	throw XmlException(XmlException::INDEXER_PARSER_ERROR, s.str());
}

void Indexer::warning(const SAXParseException& e)
{
	std::ostringstream s;
	s << "XML Indexer: Parse warning in document '";
	s << document_->getName();
	s << "', line " << e.getLineNumber();
	s << ", char " << e.getColumnNumber();
	s << ". Parser Error Message: " << XMLChToUTF8(e.getMessage()).str();
	throw XmlException(XmlException::INDEXER_PARSER_ERROR, s.str());
}
