import subprocess
import sys
import pandas as pd
import numpy as np
import math
from itertools import combinations
from collections import Counter, defaultdict

# Third-party packages this module depends on. install_packages() iterates this
# list and tries to pip-install any entry that cannot be imported.
REQUIRED_PACKAGES = ["pandas", "numpy"]

def install_packages():
    """Import-check every entry of REQUIRED_PACKAGES and pip-install the missing ones.

    This is best-effort: a failed installation is reported on stdout and the
    loop moves on to the next package; nothing is raised to the caller.
    """
    for package in REQUIRED_PACKAGES:
        try:
            __import__(package)
        except ImportError:
            pass
        else:
            # Package is importable; nothing to do for this entry.
            continue

        print(f"Package '{package}' is missing. Attempting to install...")
        # Use the running interpreter's pip so the package lands in the same
        # environment this module is executing in.
        command = [sys.executable, "-m", "pip", "install", package]
        try:
            subprocess.check_call(command)
        except Exception as e:
            print(f"Failed to install package '{package}': {e}")

# Run the dependency check as a side effect of importing this module.
# NOTE(review): pandas and numpy are already imported at the top of this file,
# so if either one is missing, the ImportError is raised before this call is
# ever reached.
install_packages()

def construct_list_EmoTokenPair(list_list_EmoTokenPair):
    """Flatten one level of nesting: an iterable of iterables becomes a single list.

    Items keep their original relative order; empty inner iterables
    contribute nothing.
    """
    flattened = []
    for group in list_list_EmoTokenPair:
        flattened.extend(group)
    return flattened

def merge_sublist(list_sublist):
    """Concatenate the items of every sub-iterable into one flat list.

    Only a single level of nesting is removed, and item order is preserved.
    """
    merged = []
    for sublist in list_sublist:
        for item in sublist:
            merged.append(item)
    return merged

def construct_df_EmoTokenPair(list_EmoTokenPair, columns=['EmoToken', 'Emj'], sortby=''):
    """Build a DataFrame from a list of pair records.

    Args:
        list_EmoTokenPair: Row records passed straight to ``pd.DataFrame``.
        columns: Column labels for the resulting frame.
        sortby: Column label (or labels) to sort ascending by; a falsy value
            keeps the input row order.

    Returns:
        The (optionally sorted) DataFrame with a fresh 0..n-1 index.
    """
    frame = pd.DataFrame(list_EmoTokenPair, columns=columns)
    if not sortby:
        return frame.reset_index(drop=True)
    return frame.sort_values(by=sortby, ascending=True).reset_index(drop=True)

def quick_preview_df_EmoTokenPair(df, analysis_mode, key_colm_name='', keyword=None, value_colm_name=''):
    """Print a quick diagnostic view of a pair DataFrame; returns nothing.

    Modes, selected by ``analysis_mode``:

    * ``'search_keyword'`` (needs ``key_colm_name`` and ``keyword``):
      prints the number of rows whose ``key_colm_name`` equals ``keyword``,
      followed by those rows.
    * ``'group_by_key'`` (needs ``key_colm_name``, ``keyword`` and
      ``value_colm_name``): groups ``df`` by ``key_colm_name`` and, per
      group, aggregates ``value_colm_name`` into

      - ``Key``: the distinct values joined by ``", "`` in first-seen order,
      - ``Key_DistinctCount``: the number of distinct values,
      - ``Key_Frequency``: ``"value(count)"`` entries, most frequent first.

      It then prints the number of groups, followed by the aggregated row(s)
      for ``keyword``.

    Any other mode, or a mode whose required arguments are missing, prints
    nothing.

    Args:
        df: DataFrame to inspect.
        analysis_mode: ``'search_keyword'`` or ``'group_by_key'``.
        key_colm_name: Column to filter / group on.
        keyword: Value of ``key_colm_name`` to display.
        value_colm_name: Column whose values are summarised per group.
    """

    def join_distinct(values):
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # printed text is stable across runs (a set's iteration order is not).
        # str() keeps the join from failing on non-string values.
        return ", ".join(str(value) for value in dict.fromkeys(values))

    def count_items(values):
        # most_common() orders by count descending; ties stay in first-seen order.
        return ", ".join(f"{item}({count})" for item, count in Counter(values).most_common())

    if analysis_mode == 'search_keyword' and key_colm_name and keyword is not None:
        # Filter once and reuse the result for both the count and the listing.
        matches = df[df[key_colm_name] == keyword]
        print(len(matches))
        print(matches)

    if analysis_mode == 'group_by_key' and key_colm_name and keyword is not None and value_colm_name:
        df_GroupbyEmoToken = df.groupby(key_colm_name).agg(
            Key=(value_colm_name, join_distinct),
            Key_DistinctCount=(value_colm_name, "nunique"),
            Key_Frequency=(value_colm_name, count_items),
        ).reset_index()
        print(len(df_GroupbyEmoToken))
        print(df_GroupbyEmoToken[df_GroupbyEmoToken[key_colm_name] == keyword])

def calculate_emotokenpair_count_pmi_npmi(pairs):
    """Compute co-occurrence count, PMI and NPMI for each distinct (token, emoji) pair.

    Probabilities are relative frequencies over all N observed pairs::

        p(t, e) = count(t, e) / N
        p(t)    = count(t, *) / N
        p(e)    = count(*, e) / N
        PMI     = log2(p(t, e) / (p(t) * p(e)))
        NPMI    = PMI / -log2(p(t, e))

    Args:
        pairs: Iterable of hashable ``(token, emoji)`` 2-tuples. It is read
            exactly once, so a generator is accepted as well as a list.

    Returns:
        A list of ``(token, emoji, count, pmi, npmi)`` tuples, one per
        distinct pair, in first-seen order. Empty input yields ``[]``.
    """
    pair_counts = Counter(pairs)

    # Derive the marginal counts from the pair counts instead of re-reading
    # `pairs`: this is what makes one-shot iterators work.
    token_counts = Counter()
    emoji_counts = Counter()
    for (token, emoji), cooccur in pair_counts.items():
        token_counts[token] += cooccur
        emoji_counts[emoji] += cooccur
    total = sum(pair_counts.values())

    results = []
    for (token, emoji), cooccur in pair_counts.items():
        # Every count is >= 1 here, so all three probabilities are strictly
        # positive and the logarithms below are always defined.
        p_te = cooccur / total
        p_t = token_counts[token] / total
        p_e = emoji_counts[emoji] / total

        pmi = math.log2(p_te / (p_t * p_e))
        if cooccur == total:
            # p(t, e) == 1 makes the normaliser -log2(1) == 0 (a 0/0 form).
            # The pair always co-occurs, which is NPMI's upper bound.
            npmi = 1.0
        else:
            npmi = pmi / -math.log2(p_te)

        results.append((token, emoji, cooccur, pmi, npmi))

    return results


def convert_list_npmi_to_df(list_emotoken_npmi, columns=['EmoToken', 'Emj', 'Count', 'PMI', 'NPMI'], sortby=''):
    """Turn a list of per-pair score records into a DataFrame.

    Args:
        list_emotoken_npmi: Row records passed straight to ``pd.DataFrame``.
        columns: Column labels for the resulting frame.
        sortby: Column label (or labels) to sort ascending by; a falsy value
            keeps the input row order.

    Returns:
        The (optionally sorted) DataFrame, re-indexed from 0.
    """
    table = pd.DataFrame(list_emotoken_npmi, columns=columns)
    ordered = table.sort_values(by=sortby, ascending=True) if sortby else table
    return ordered.reset_index(drop=True)




