import subprocess
import sys
import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import TfidfVectorizer

# Third-party packages that install_packages() tries to import and,
# when the import fails, attempts to install with pip.
REQUIRED_PACKAGES = ["pandas", "numpy"]

def install_packages():
    """
    Best-effort installer for every package named in REQUIRED_PACKAGES.

    Each package is imported by name; when that raises ImportError, pip is
    invoked through the current interpreter to install it. A failed install
    is reported on stdout and never raised, so the loop always proceeds to
    the next package.
    """
    for package in REQUIRED_PACKAGES:
        try:
            __import__(package)
            # Already importable: nothing to install.
            continue
        except ImportError:
            print(f"Package '{package}' is missing. Attempting to install...")

        # Use the running interpreter so pip targets the same environment.
        pip_command = [sys.executable, "-m", "pip", "install", package]
        try:
            subprocess.check_call(pip_command)
        except Exception as err:
            print(f"Failed to install package '{package}': {err}")

# Ensure required packages are installed. This runs as a side effect of
# importing the module.
# NOTE(review): the pandas/numpy imports at the top of this file execute
# before this call, so a missing package would already have raised
# ImportError there -- confirm whether this call should run earlier.
install_packages()




def TfidfVectorize_documents(documents, threshold=0.25, min_df=1, max_df=1.0, ngram_size=(1, 1)):
    """
    Compute TF-IDF scores and reduce each document to its high-scoring words.

    Parameters:
    - documents: Collection of text documents (strings). It is traversed
      twice (once to fit the vectorizer, once to filter), so it must be
      re-iterable, e.g. a list.
    - threshold: A word is kept only if its TF-IDF score in that document
      is strictly greater than this value.
    - min_df: Minimum document frequency for terms (passed to TfidfVectorizer).
    - max_df: Maximum document frequency for terms (passed to TfidfVectorizer).
    - ngram_size: (min_n, max_n) tuple passed as ``ngram_range``.

    Returns:
    - List of filtered texts, one string per document, with the surviving
      words joined by single spaces in their original order.
    - TF-IDF DataFrame (one row per document, one column per feature).

    Note:
    Words are obtained with ``str.split()`` and compared in lower case
    against the vectorizer's feature names. A whitespace-delimited token
    that is not itself a feature name (for example one carrying attached
    punctuation, or any multi-word n-gram feature) is never kept.
    """
    # Initialize the TF-IDF Vectorizer with specified parameters
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=ngram_size)

    # Fit and transform the documents
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Get the feature names
    feature_names = vectorizer.get_feature_names_out()

    # Densify once; the same array backs both the returned DataFrame and
    # the per-document threshold test below.
    dense_scores = tfidf_matrix.toarray()
    df_tfidf = pd.DataFrame(dense_scores, columns=feature_names)

    filtered_texts = []
    for i, doc in enumerate(documents):
        # A set gives O(1) membership tests for every word of the document.
        significant_words = set(feature_names[dense_scores[i] > threshold])

        # Filter words while maintaining the original sequence
        kept_words = [word for word in doc.split() if word.lower() in significant_words]
        filtered_texts.append(' '.join(kept_words))

    return filtered_texts, df_tfidf






def convert_label_to_vector(list_label, dict_vector):
    """
    Build a vector from a template dict by setting the given labels to 1.

    Parameters:
    - list_label: Iterable of labels (dict keys) to mark with 1.
    - dict_vector: Template mapping of label -> default value. It is copied,
      never modified.

    Returns:
    - List of the resulting values in the template's key order. A label that
      is not a key of the template is appended as an extra trailing entry.
    """
    marked = dict_vector.copy()
    # fromkeys keeps first-occurrence order, so new labels append in the
    # same order as a one-by-one assignment would.
    marked.update(dict.fromkeys(list_label, 1))
    return [*marked.values()]

def elementwise_sum_list(list_of_vectors):
    """
    Add equally sized vectors position by position.

    Parameters:
    - list_of_vectors: List of vectors (sequences of numbers).

    Returns:
    - List whose i-th entry is the sum of the i-th entries of all vectors;
      an empty list when no vectors are given.

    Raises:
    - ValueError: if the vectors do not all have the same length.
    """
    if not list_of_vectors:
        return []

    expected_length = len(list_of_vectors[0])
    for vec in list_of_vectors:
        if len(vec) != expected_length:
            raise ValueError("All vectors must have the same length.")

    # zip(*...) regroups the vectors by position; sum collapses each group.
    return list(map(sum, zip(*list_of_vectors)))

def aggregate_vectors(df):
    """
    Sum the "EmoVector" lists element-wise within each "Emj" group.

    Parameters:
    - df: DataFrame with an "Emj" column (group key) and an "EmoVector"
      column holding one vector per row.

    Returns:
    - DataFrame with columns "Emj" and "EmoVector", one row per distinct
      "Emj" value, where "EmoVector" is the position-wise sum of the group's
      vectors.
    """
    def _sum_by_position(vectors):
        # zip(*vectors) lines up the i-th component of every vector.
        return [sum(components) for components in zip(*vectors)]

    vectors_by_group = df.groupby("Emj")["EmoVector"]
    summed = vectors_by_group.apply(_sum_by_position)
    return summed.reset_index()

def normalize_vector(vector):
    """
    Min-max scale a vector so its smallest value maps to 0 and largest to 1.

    Parameters:
    - vector: Non-empty list of numbers.

    Returns:
    - New list of scaled values, or the input vector itself (same object)
      when all of its values are equal.

    Raises:
    - ValueError: if the vector is empty.
    """
    if not vector:
        raise ValueError("The vector is empty.")

    lowest = min(vector)
    highest = max(vector)

    # A constant vector has zero range; hand it back untouched instead of
    # dividing by zero.
    if lowest == highest:
        return vector

    scaled = []
    for x in vector:
        scaled.append((x - lowest) / (highest - lowest))
    return scaled

def discretize_vector(vector, cutoff_input):
    """
    Turn a numeric vector into 0/1 flags by comparing against a cutoff.

    Parameters:
    - vector: Non-empty list of numbers.
    - cutoff_input: Cutoff to compare against. When it is an int or float it
      is used directly; any other value selects the mean of the vector.

    Returns:
    - List with 1 where the element is >= the cutoff and 0 elsewhere. When
      every element equals zero, a list of 0.0 (floats) of the same length
      is returned regardless of the cutoff.

    Raises:
    - ValueError: if the vector is empty.
    """
    if not vector:
        raise ValueError("The vector is empty.")

    # An all-zero vector short-circuits so a cutoff of 0 cannot flag it.
    if all(value == 0 for value in vector):
        return [0.0] * len(vector)

    if isinstance(cutoff_input, (int, float)):
        cutoff_value = cutoff_input
    else:
        cutoff_value = sum(vector) / len(vector)

    flags = []
    for value in vector:
        if value >= cutoff_value:
            flags.append(1)
        else:
            flags.append(0)
    return flags

def convert_vector_to_dict(list_vector, dict_label_0):
    """
    Pair the keys of a template dict with the values of a vector.

    Parameters:
    - list_vector: Values to assign, in the template's key order.
    - dict_label_0: Mapping whose keys label the vector positions.

    Returns:
    - Dict mapping each key to the value at the same position. Pairing stops
      at the shorter of the two inputs.
    """
    labels = dict_label_0.keys()
    return {label: value for label, value in zip(labels, list_vector)}
