package org.troy.capstone.search_engine.query;

import java.io.IOException;
import java.util.Map;
import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.ngram.NGramTokenizerFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.troy.capstone.constants.DataPath;
import org.troy.capstone.constants.TableColumnName;
import org.troy.capstone.utils.TableUtils;

import tech.tablesaw.api.Row;
import tech.tablesaw.api.Table;

/**
 * Interactive BM25 search engine over the cleaned attributed data set.
 *
 * <p>Indexes each row's name and description into an in-memory Lucene index
 * using a 2-5 character n-gram analyzer, then answers user queries from stdin
 * until "exit" is entered.
 *
 * <p>Lucene was used as a helper source [4] - highly optimized, better than a
 * hand-rolled implementation due to its optimizations and data structures.
 * [5] was used for building the analyzer. Base code source is [6].
 */
public class MyBM25 {
    /** Hits must score at least this fraction of the top hit to be displayed. */
    private static final double SCORE_THRESHOLD_RATIO = 0.2;
    /** Maximum number of hits retrieved per query. */
    private static final int MAX_HITS = 1000;

    public static void main(String[] args) throws Exception {
        Analyzer ngramAnalyzer = buildAnalyzer();

        // ByteBuffersDirectory chosen to keep all data in RAM for speed and simplicity.
        Directory directory = new ByteBuffersDirectory();

        System.out.println("Indexing data and building search engine...");
        buildIndex(directory, ngramAnalyzer);
        System.out.println("Indexing complete. You can now enter search queries.");

        // FIX: the reader was previously never closed (resource leak);
        // try-with-resources guarantees it is released when the loop exits.
        try (IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.setSimilarity(new BM25Similarity());
            runSearchLoop(searcher, buildParser(ngramAnalyzer));
        }
    }

    /**
     * Builds the query/index analyzer.
     *
     * <p>NGramTokenizer tokenizes into all substrings of length 2-5, giving
     * better typo tolerance and partial matching. StopFilter filters out common
     * English words; the tokenizer also splits on non-letter chars and
     * lowercases all tokens.
     */
    private static Analyzer buildAnalyzer() throws IOException {
        return CustomAnalyzer.builder()
            .withTokenizer(NGramTokenizerFactory.class, "minGramSize", "2", "maxGramSize", "5")
            .addTokenFilter(StopFilterFactory.class)
            .build();
    }

    /**
     * Reads the cleaned data table and indexes every row into {@code directory}.
     *
     * <p>BM25 is used instead of the default TF-IDF for better relevance
     * scoring: term-frequency saturation, length normalization, better ranking
     * quality, and wide use in modern search engines.
     */
    private static void buildIndex(Directory directory, Analyzer analyzer) throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setSimilarity(new BM25Similarity());
        Table table = TableUtils.readData(DataPath.CLEANED_ATTRIBUTED_DATA);
        try (IndexWriter writer = new IndexWriter(directory, config)) {
            for (Row row : table) {
                addDoc(writer,
                        row.getString(TableColumnName.ID.getColumnName()),
                        row.getString(TableColumnName.NAME.getColumnName()),
                        row.getString(TableColumnName.DESCRIPTION.getColumnName())
                );
            }
        }
    }

    /**
     * Builds a multi-field parser that boosts the name field (2x) over the
     * description. Default operator is OR for better recall, so a document
     * matching any term is included in the results.
     */
    private static MultiFieldQueryParser buildParser(Analyzer analyzer) {
        String[] fields = {"name", "description"};
        Map<String, Float> boosts = Map.of(
            "name", 2.0f,
            "description", 1.0f
        );
        MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer, boosts);
        parser.setDefaultOperator(MultiFieldQueryParser.Operator.OR);
        return parser;
    }

    /** Reads queries from stdin and prints ranked results until "exit" is entered. */
    private static void runSearchLoop(IndexSearcher searcher, MultiFieldQueryParser parser)
            throws IOException {
        try (Scanner scan = new Scanner(System.in)) {
            while (true) {
                System.out.print("Enter search query (or 'exit' to quit): ");
                String userQuery = scan.nextLine().trim();
                if (userQuery.equals("exit"))
                    break;
                // FIX: a blank line used to reach the parser and throw.
                if (userQuery.isEmpty())
                    continue;

                System.out.println("Searching for: '" + userQuery + "' (using n-gram analysis)");

                // N-gram parsing handles both exact matches and typos automatically.
                Query query;
                try {
                    query = parser.parse(userQuery);
                } catch (ParseException e) {
                    // FIX: malformed input (e.g. unbalanced quotes) previously
                    // crashed the whole program; report and keep the loop alive.
                    System.err.println("Could not parse query: " + e.getMessage());
                    continue;
                }
                TopDocs results = searcher.search(query, MAX_HITS);
                printResults(searcher, results, userQuery);
            }
        }
    }

    /** Prints hits scoring above the relative threshold, with names and scores side by side. */
    private static void printResults(IndexSearcher searcher, TopDocs results, String userQuery)
            throws IOException {
        // Allow scores that are at least 20% of the top score to be included.
        double minimumAllowableScore =
            results.scoreDocs.length > 0 ? results.scoreDocs[0].score * SCORE_THRESHOLD_RATIO : 0.0;

        System.out.println("Total Hits: " + results.scoreDocs.length
                + " for search term: '" + userQuery + "'");

        StoredFields storedFields = searcher.storedFields();
        System.out.println("Document Name | Score");
        System.out.println("----------------------------------------");

        int displayed = 0;
        for (ScoreDoc scoreDoc : results.scoreDocs) {
            // Hits arrive sorted by score, so the first sub-threshold hit ends the listing.
            if (scoreDoc.score < minimumAllowableScore)
                break;
            try {
                Document doc = storedFields.document(scoreDoc.doc);
                System.out.printf("%-25s | %.4f%n", doc.get("name"), scoreDoc.score);
                displayed++;
            } catch (IOException e) {
                System.err.println("Error retrieving document: " + e.getMessage());
            }
        }
        System.out.println("Displayed " + displayed
                + " results with scores above the threshold of " + minimumAllowableScore);
    }

    /**
     * Adds one document to the index.
     *
     * @param w    open index writer
     * @param id   stored but not indexed - not searchable
     * @param name searchable and stored (will be removed prior to use in main program)
     * @param desc searchable but not stored
     */
    private static void addDoc(IndexWriter w, String id, String name, String desc)
            throws IOException {
        Document doc = new Document();
        doc.add(new StoredField("id", id));
        doc.add(new TextField("name", name, Field.Store.YES));
        doc.add(new TextField("description", desc, Field.Store.NO));
        w.addDocument(doc);
    }
}
