﻿import os

import pandas as pd

# Input locations; all paths are relative to the current working directory.
# Primary dataset with a 'PSČ' column; read by preprocess_primary()
# as a ';'-separated UTF-8 CSV.
primary_data = 'data/data_dle_psč.csv'
# Read by preprocess_psc_obce() as a ','-separated UTF-8 CSV.
psc_obce_data = 'data/obce_psč_předpočítané.csv'
# Hierarchy table with an 'OBEC_KOD' column; read by preprocess_hierarchy()
# as a ';'-separated cp1250 CSV.
hierarchy_data = 'data/vazby-cr.csv'
# Folder whose *.csv files (';'-separated, cp1250) are concatenated by
# preprocess_zip_codes().
address_folder = 'data/adresy/'


def preprocess_primary():
    """Load the primary dataset and keep only the completed records.

    Reads ``primary_data`` (semicolon-separated, UTF-8), keeps the rows whose
    ``Complete?`` column equals ``"Complete"``, removes that column, and
    replaces the values ``"muž"`` / ``"žena"`` with ``"M"`` / ``"F"``
    wherever they occur in the frame.

    Returns:
        pandas.DataFrame: the filtered and recoded data.
    """
    raw = pd.read_csv(primary_data, sep=';', encoding='utf-8')
    is_complete = raw["Complete?"] == "Complete"
    gender_codes = {"muž": "M", "žena": "F"}

    return (
        raw.loc[is_complete]
        .drop(columns=['Complete?'])
        .replace(gender_codes)
    )

def preprocess_psc_obce():
    """Read the CSV at ``psc_obce_data`` (comma-separated, UTF-8).

    Returns:
        pandas.DataFrame: the file contents, unmodified.
    """
    return pd.read_csv(psc_obce_data, encoding='utf-8', sep=',')

def preprocess_hierarchy():
    """Load the territorial hierarchy table with one row per ``OBEC_KOD``.

    Reads ``hierarchy_data`` (semicolon-separated, cp1250), drops the
    ``COBCE_KOD`` column together with the duplicate rows that remain, and
    fills missing ``OKRES_KOD`` values with the sentinel 9999 (only Praha,
    OBEC_KOD 554782, is expected to have no okres).

    Returns:
        pandas.DataFrame: the de-duplicated hierarchy with an integer
        ``OKRES_KOD`` column.

    Raises:
        AssertionError: if any ``OBEC_KOD`` occurs more than once, or if the
            set of obce carrying the sentinel okres is not exactly
            ``{554782}``.
    """
    df = pd.read_csv(hierarchy_data, sep=';', encoding='cp1250')
    df = df.drop(columns=['COBCE_KOD'])
    df = df.drop_duplicates()

    df['OKRES_KOD'] = df['OKRES_KOD'].fillna(9999).astype(int)  # Praha

    # The checks below raise explicitly instead of using ``assert`` so they
    # still run under ``python -O``; AssertionError is kept so existing
    # callers see the same exception type and message.

    # validate uniqueness of OBEC_KOD
    if df['OBEC_KOD'].duplicated().any():
        raise AssertionError("Duplicate OBEC_KOD found")

    # validate that the sentinel OKRES_KOD was assigned to Praha only
    sentinel_obce = set(df.loc[df['OKRES_KOD'] == 9999, 'OBEC_KOD'])
    if sentinel_obce != {554782}:
        raise AssertionError("Invalid OKRES_KOD for OBEC_KOD")

    return df


def aggregate_probabilities_by_hierarchy(df_zip_codes, df_hierarchy, psc_input, target_level):
    """Distribute one PSČ over the units of a hierarchy level.

    Every obec that shares the PSČ gets a weight proportional to its
    ``total_count``; the weights are then summed per ``target_level`` value.

    Args:
        df_zip_codes: DataFrame with the columns ``PSČ``, ``Kód obce`` and
            ``total_count``.
        df_hierarchy: DataFrame with an ``OBEC_KOD`` column and the
            ``target_level`` column.
        psc_input: PSČ to look up (compared with ``==`` against the ``PSČ``
            column).
        target_level: name of the ``df_hierarchy`` column to aggregate by.

    Returns:
        list[tuple]: ``(target_level value, probability)`` pairs ordered by
        the target value, probabilities rounded to 5 decimals. The list is
        empty when the PSČ is not present in ``df_zip_codes``. Obce without
        a match in ``df_hierarchy`` end up with a missing target value and
        are left out by the group-by, so the probabilities can sum to less
        than 1.

    Raises:
        ValueError: if ``target_level`` is not a column of ``df_hierarchy``.
    """
    # Validate before doing any work so a bad level is reported even when
    # the PSČ itself is unknown.
    if target_level not in df_hierarchy.columns:
        raise ValueError(f"Invalid target_level '{target_level}', must be one of: {list(df_hierarchy.columns[1:])}")

    # Rows of the lookup table that belong to the requested PSČ
    psc_rows = df_zip_codes.loc[df_zip_codes['PSČ'] == psc_input, ['Kód obce', 'total_count']]

    if psc_rows.empty:
        # Same type as the regular return value, so callers can treat both
        # cases uniformly.
        return []

    # Share of the PSČ's address points that falls into each obec
    probs_df = psc_rows.copy()
    probs_df['probability'] = probs_df['total_count'] / probs_df['total_count'].sum()

    # Attach the hierarchy columns; a left join keeps obce that are missing
    # from the hierarchy (their target value is NaN).
    merged_df = probs_df.merge(df_hierarchy, left_on='Kód obce', right_on='OBEC_KOD', how='left')

    aggregated_probabilities = merged_df.groupby(target_level)['probability'].sum().round(5)

    return list(aggregated_probabilities.items())


def compute_probabilities_POU(psc):
    """Return the ``POU_KOD`` probability breakdown for a single PSČ.

    Thin wrapper around ``aggregate_probabilities_by_hierarchy``. It reads
    the module-level names ``df_obce`` and ``df_hierarchy``, which this file
    assigns only inside its ``__main__`` block, so they must exist before
    this function is called.
    """
    return aggregate_probabilities_by_hierarchy(
        df_zip_codes=df_obce,
        df_hierarchy=df_hierarchy,
        psc_input=psc,
        target_level="POU_KOD",
    )


def preprocess_zip_codes():
    """Count address rows per (``Kód obce``, ``PSČ``) pair.

    Reads every ``*.csv`` file in ``address_folder`` (semicolon-separated,
    cp1250), concatenates them and groups the rows by ``Kód obce`` and
    ``PSČ``.

    Returns:
        pandas.DataFrame: one row per (``Kód obce``, ``PSČ``) pair with the
        columns ``cp_count`` (rows whose ``Typ SO`` is ``'č.p.'``),
        ``cev_count`` (rows whose ``Typ SO`` is ``'č.ev.'``) and
        ``total_count`` (rows with a non-null ``Typ SO``).

    Raises:
        ValueError: raised by ``pandas.concat`` when the folder contains no
            CSV file.
    """
    csv_names = [name for name in os.listdir(address_folder) if name.endswith('.csv')]
    frames = [
        pd.read_csv(os.path.join(address_folder, name), sep=';', encoding='cp1250')
        for name in csv_names
    ]

    # Per the original note, the vojenské újezdy 545422 Boletice and
    # 503941 Libavá have 0 address points. Passing their empty frames to
    # pd.concat triggers "FutureWarning: The behavior of DataFrame
    # concatenation with empty or all-NA entries is deprecated", so they are
    # left out. If every frame is empty, the full list is used so the
    # result is still an (empty) frame with the expected columns.
    non_empty_frames = [frame for frame in frames if not frame.empty]
    combined_df = pd.concat(non_empty_frames or frames, ignore_index=True)
    combined_df = combined_df[['Kód obce', 'Typ SO', 'PSČ']]

    # Flag the two building-number types once for the whole table; summing
    # the flags per group replaces a Python-level lambda call per group.
    combined_df = combined_df.assign(
        is_cp=combined_df['Typ SO'] == 'č.p.',
        is_cev=combined_df['Typ SO'] == 'č.ev.',
    )

    grouped_df = combined_df.groupby(['Kód obce', 'PSČ']).agg(
        cp_count=('is_cp', 'sum'),  # Count of 'č.p.'
        cev_count=('is_cev', 'sum'),  # Count of 'č.ev.'
        total_count=('Typ SO', 'count'),  # Count of rows with a 'Typ SO' value
    ).reset_index()

    return grouped_df



if __name__ == '__main__':
    # Script entry point: load the inputs, spread every primary row over the
    # POU_KOD units its PSČ may belong to, and print the summed
    # probabilities per unit.
    df_primary = preprocess_primary()
    df_obce = preprocess_zip_codes()
    df_hierarchy = preprocess_hierarchy()

    ### V1 (disabled): plain inner joins and a row count per OKRES_KOD
    # df_merged = df_primary.merge(df_obce, left_on='PSČ', right_on='PSČ', how='inner')
    # df_merged = df_merged.merge(df_hierarchy, left_on='Kód obce', right_on='OBEC_KOD', how='inner')
    # print(df_merged)
    #
    # df_grouped = df_merged.groupby(['OKRES_KOD']).size().reset_index(name='Count')
    # print(df_grouped)

    ### V2: probabilistic assignment
    # Sample lookup for a single PSČ
    p = aggregate_probabilities_by_hierarchy(df_obce, df_hierarchy, 33011, 'POU_KOD')
    print(p)

    # Same call as df_primary["PSČ"].apply(compute_probabilities_POU)
    df_primary['probabilities'] = df_primary["PSČ"].apply(
            lambda psc: aggregate_probabilities_by_hierarchy(df_obce, df_hierarchy, psc, "POU_KOD")
    )

    # One output row per (primary row, candidate unit) pair
    df_exploded = df_primary.explode("probabilities")

    # explode() turns an empty result (PSČ without any probabilities) into
    # NaN, which cannot be unpacked below; report and skip those rows
    # instead of failing with a TypeError.
    unmatched = df_exploded["probabilities"].isna()
    if unmatched.any():
        print(f"Skipping {int(unmatched.sum())} rows whose PSČ has no probabilities")
        df_exploded = df_exploded.loc[~unmatched].copy()

    df_exploded["code"] = df_exploded["probabilities"].apply(lambda x: x[0])
    df_exploded["prob"] = df_exploded["probabilities"].apply(lambda x: x[1])
    df_exploded = df_exploded.drop(columns=["probabilities"])

    df_grouped = df_exploded.groupby("code")["prob"].sum()
    print(df_grouped)

    df_grouped = df_exploded.groupby("code").agg(
        prob_sum=("prob", "sum"),  # Sum of probabilities
        min_psc=("PSČ", "min"),    # Minimum PSČ in the group
        max_psc=("PSČ", "max")     # Maximum PSČ in the group
    ).reset_index()
    print(df_grouped)

