#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tripleblind as tb


# Fetch the example training CSV and load it into a pandas DataFrame.
#
# NOTE(review): set_script_dir_current() presumably makes this script's own
# directory the working directory so the relative paths below resolve --
# confirm against the tripleblind documentation.
tb.util.set_script_dir_current()

data_dir = Path("example_data")
resource_name = "santander-customer-transaction-prediction_train.csv"
cache_location = "../../../.cache"

# The helper's return value is handed straight to pandas, so it is expected
# to be a path (or path-like) to the downloaded CSV.
train_data = tb.util.download_tripleblind_resource(
    resource_name,
    save_to_dir=data_dir,
    cache_dir=cache_location,
)

print("Creating datasets...")
df = pd.read_csv(train_data)

# Healthcare-flavoured replacements for the anonymous "var_<i>" columns.
# Position i in this list is the new name of column "var_<i>".
clinical_names = [
    "Stay ID",
    "ICD Code",
    "Stay Duration",
    "Charge",
    "Age",
    "Height",
    "Weight",
    "TempF",
    "Pulse",
    "Resprate",
    "BPSys",
    "BPDias",
    "O2 Sat",
    "Haemoglobins",
    "Erythrocyte",
    "Hematocrit",
    "BG",
    "Thrombocyte",
    "MCH",
    "#Outpatient Vists",
    "#ICU Stays",
    "#Inpatient Stays",
    "#Mental Trauma Screening",
    "#Law Enforcement Encounter",
    "MCHC",
    "MCV",
]
column_map = {f"var_{i}": name for i, name in enumerate(clinical_names)}

# Keep only the first 28 columns (the 26 features listed above plus the
# target and id_code columns), then apply the new names.
df = df.iloc[:, :28].rename(columns=column_map)

#########################################################################
# Process the file for several reasons:
# 1. To format as needed for model building (scaled, numerical data)
# 2. To produce a single file for local inference/testing (20,000 random rows)
# 3. To divide the remaining data into 3 simulated hospital files
#    (i.e. 3 different hospitals: Hosp1, Hosp2, Hosp3)
# 4. To format the data for uploading to separate Access Points and provide
#    exposure to TripleBlind concepts and tools.

#########################################################################
#  PRE-PROCESS FOR FORMAT

# The ID column carries nothing this analysis needs, so discard it
df = df.drop(columns=["ID_code"])


##################
# Standardize the feature columns only; the target must stay unscaled
#
# Set the target aside as float32 ...
y = df["target"].to_numpy().astype(np.float32)
# ... and remove it from the frame that is about to be scaled
df = df.drop(columns=["target"])
#
# Remember the feature names so the scaled result can be re-labelled below
cols = df.columns
#
## Fit the scaler on the features and transform them in one step
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df)
#
# Rebuild a dataframe from the scaled values using the saved column names
df = pd.DataFrame(scaled_features, columns=cols)
# Restore the target as the first column
df.insert(0, "target", y)

###########################################################################
# Split the data into parts
#
# Hold out 10% of the rows for testing; the other 90% is training data.
# The fixed random_state keeps the split reproducible between runs.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=101)

# Full held-out test batch
df_test.to_csv(data_dir / "test.csv", index=False)

# Smaller test batch: the first 50 records of the held-out set
test_small = df_test[:50]
test_small.to_csv(data_dir / "test_small.csv", index=False)

# For convenience, also write the small batch as separate feature and target
# files (data science convention: X is the data, y is the target)
X = test_small.drop(columns=["target"])
y = test_small["target"].copy()

X.to_csv(data_dir / "test_small_X.csv", index=False)
y.to_csv(data_dir / "test_small_target.csv", header=["target"], index=False)

###########################################################################
#
# NOTE: no second split is performed here. The local "test" set (10% of the
# labeled data) was already produced above by
# train_test_split(df, test_size=0.1, random_state=101) and saved for testing.

###########################################################################
# Create 3 different training data files - one for each simulated hospital
#
# Slices are half-open: [start:stop] includes `start` but excludes `stop`.
# Consecutive partitions therefore have to share their boundary (the stop of
# one is the start of the next). Starting the next partition at stop + 1, as
# this script previously did, silently dropped one training row at every
# boundary (positions 60000 and 120000, and 6 and 13 for the small files).
hospital_names = ("Hosp1", "Hosp2", "Hosp3")

# Full-size partitions: 60000 consecutive training rows per hospital
# (rows 0-59999, 60000-119999 and 120000-179999).
rows_per_hospital = 60000
for position, hospital in enumerate(hospital_names):
    start = position * rows_per_hospital
    stop = start + rows_per_hospital
    df_train[start:stop].to_csv(data_dir / f"{hospital}.csv", index=False)

# Small partitions: 6 consecutive training rows per hospital
# (rows 0-5, 6-11 and 12-17), same row count per file as before.
rows_per_small_file = 6
for position, hospital in enumerate(hospital_names):
    start = position * rows_per_small_file
    stop = start + rows_per_small_file
    df_train[start:stop].to_csv(data_dir / f"{hospital}_small.csv", index=False)

print("Data is ready to be uploaded to Access Points.")
