import subprocess
import sys
import os
import matplotlib.pyplot as plt
import pickle
import datetime
from tqdm import tqdm
from gensim.models import CoherenceModel, LdaModel
from gensim.corpora import Dictionary


# Module names checked by install_packages(); each name is used both as the
# import name and as the pip distribution name.
# NOTE: "pickle", "os" and "datetime" are standard-library modules, so they
# always import successfully and never reach the pip step.
REQUIRED_PACKAGES = ["gensim", "matplotlib", "pickle", "tqdm", "os", "datetime"]


def install_packages(packages=None):
    """Install any required packages that cannot be imported.

    Each name is imported; when that raises ImportError, ``pip install <name>``
    is run with the current interpreter. This is best-effort: a failed
    installation is reported on stdout and the loop moves on to the next name.

    Args:
        packages (iterable of str, optional): Module names to check. Defaults
            to ``REQUIRED_PACKAGES`` when omitted or ``None``.

    Returns:
        None
    """
    import importlib

    if packages is None:
        packages = REQUIRED_PACKAGES

    for package in packages:
        try:
            importlib.import_module(package)
        except ImportError:
            print(f"Package '{package}' is missing. Attempting to install...")
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            except Exception as e:
                # Deliberately broad: installation is best-effort and must not
                # abort the caller; the failure is reported instead.
                print(f"Failed to install package '{package}': {e}")
            else:
                # Let the running interpreter's import system notice modules
                # that were installed after it started.
                importlib.invalidate_caches()
# Ensure required packages are installed.
# NOTE(review): this call runs at import time, but only after the third-party
# imports at the top of this file have executed; if one of those packages is
# missing, the import fails there before this call is reached.
install_packages()

# Function to analyze topics using LDA
def _best_topic_count(topic_counts, scores):
    """Return the topic count whose coherence score is highest, ranking NaN lowest."""
    import math

    def rank(pair):
        score = pair[1]
        # NaN compares false against everything, which would make max() depend
        # on element order; treat it as the worst possible score instead.
        return float("-inf") if math.isnan(score) else score

    # max() keeps the first of several equal maxima, i.e. the earliest topic count.
    best_count, _ = max(zip(topic_counts, scores), key=rank)
    return best_count


def explore_topic_modeling(
    ngram_texts, path_dset, dset_name, min_topics=2, max_topics=10, step=1, no_below=3, no_above=0.8
):
    """
    Run a coherence sweep over topic counts and fit a final LDA model.

    For every topic count in ``range(min_topics, max_topics + 1, step)`` an LDA
    model is trained (``passes=10``, ``random_state=42``) and scored with the
    ``c_v`` coherence measure. The count with the highest score is then used to
    train the final model (``passes=20``). The dictionary, corpus, coherence
    scores, coherence plot and final model are written to ``path_dset`` with
    file names of the form ``{dset_name}_<kind>_{timestamp}``.

    Args:
        ngram_texts (list): Processed text data (unigrams + bigrams), one token
            list per document.
        path_dset (str): Directory where output files will be saved; created if
            it does not exist.
        dset_name (str): Prefix used for every output file name.
        min_topics (int): Minimum number of topics to evaluate.
        max_topics (int): Maximum number of topics to evaluate (inclusive).
        step (int): Step size for the number of topics.
        no_below (int): Remove words that appear in fewer than `no_below` documents.
        no_above (float): Remove words that appear in more than `no_above` fraction of documents.

    Returns:
        dict: Paths of the saved outputs under the keys ``"dictionary"``,
        ``"corpus"``, ``"lda_model"``, ``"coherence_scores"`` and
        ``"coherence_plot"``, plus the selected topic count (int) under
        ``"optimal_topics"``.

    Raises:
        ValueError: If the topic range is empty or contains a value below 1.
    """
    # Validate the topic grid before any file is written, so a bad argument
    # does not leave a half-populated output directory behind.
    topic_range = range(min_topics, max_topics + 1, step)
    if len(topic_range) == 0:
        raise ValueError(
            f"Empty topic range: min_topics={min_topics}, max_topics={max_topics}, step={step}"
        )
    if min(topic_range) < 1:
        raise ValueError(f"Topic counts must be at least 1, got {min(topic_range)}")

    # Minute-resolution timestamp shared by every output file of this run.
    timestamp = datetime.datetime.now().strftime("%Y%m%dT%H%M")

    # Create output directory
    output_dir = os.fspath(path_dset)
    os.makedirs(output_dir, exist_ok=True)

    # Create dictionary & corpus
    dictionary = Dictionary(ngram_texts)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    corpus = [dictionary.doc2bow(text) for text in ngram_texts]

    # Save dictionary & corpus
    dictionary_path = os.path.join(output_dir, f"{dset_name}_dictionary_{timestamp}.dict")
    corpus_path = os.path.join(output_dir, f"{dset_name}_corpus_{timestamp}.pkl")
    dictionary.save(dictionary_path)
    with open(corpus_path, "wb") as f:
        pickle.dump(corpus, f)

    # Compute one coherence score per candidate topic count
    coherence_values = []
    for num_topics in tqdm(topic_range):
        lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, random_state=42, passes=10)
        coherence_model = CoherenceModel(model=lda_model, texts=ngram_texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())

    # Find optimal number of topics (NaN scores never win)
    optimal_topic_no = _best_topic_count(topic_range, coherence_values)

    # Save the raw coherence values, including any NaN entries
    coherence_path = os.path.join(output_dir, f"{dset_name}_coherence_scores_{timestamp}.pkl")
    with open(coherence_path, "wb") as f:
        pickle.dump(coherence_values, f)

    # Plot coherence scores on an explicit figure so that the figure is always
    # released, even when saving fails.
    coherence_plot_path = os.path.join(output_dir, f"{dset_name}_coherence_plot_{timestamp}.png")
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.plot(list(topic_range), coherence_values, marker="o", label="Coherence Score")
        ax.set_xlabel("Number of Topics")
        ax.set_ylabel("Coherence Score")
        ax.set_title("Coherence Score vs. Number of Topics")
        ax.legend()
        ax.grid()
        fig.savefig(coherence_plot_path)
    finally:
        plt.close(fig)

    # Create final LDA model with the optimal number of topics
    lda_model = LdaModel(corpus=corpus, num_topics=optimal_topic_no, id2word=dictionary, random_state=42, passes=20)

    # Save LDA Model
    lda_model_path = os.path.join(output_dir, f"{dset_name}_lda_model_{timestamp}.model")
    lda_model.save(lda_model_path)

    # Return output paths
    return {
        "dictionary": dictionary_path,
        "corpus": corpus_path,
        "lda_model": lda_model_path,
        "coherence_scores": coherence_path,
        "coherence_plot": coherence_plot_path,
        "optimal_topics": optimal_topic_no,
    }

