#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tripleblind as tb


# Fetch the raw Santander training CSV and load it with pandas.
# NOTE(review): set_script_dir_current() presumably makes this script's folder
# the working directory so the relative paths below resolve -- confirm in tb.util.
tb.util.set_script_dir_current()

# Every file produced by this script is written under this folder.
data_dir = Path("example_data")

# Where the downloaded resource is saved and which cache folder to use.
download_options = {"save_to_dir": data_dir, "cache_dir": "../../../.cache"}
train_data = tb.util.download_tripleblind_resource(
    "santander-customer-transaction-prediction_train.csv", **download_options
)

print("Creating datasets...")
df = pd.read_csv(train_data)

# Keep only the first 28 columns, i.e. 26 features plus the id_code and target
# columns, and give the anonymous "var_N" features readable names.
#
# The position of a name in this list is the N of the "var_N" column it replaces.
feature_names = [
    "sal",
    "reemb",
    "amt",
    "svr",
    "recib",
    "tar",
    "ind",
    "ind_larg",
    "ind_med",
    "ind_cort",
    "ind_emit",
    "ind_recib",
    "sal_q1",
    "sal_q2",
    "sal_q3",
    "sal_q4",
    "imp_amort",
    "imp_venta",
    "imp_sal",
    "imp_op",
    "imp_aport",
    "saldo",
    "base",
    "FoR",
    "num_med",
    "mes_med",
]
df = df.iloc[:, :28].rename(
    columns={f"var_{number}": name for number, name in enumerate(feature_names)}
)

#########################################################################
# Process the file for several reasons:
# 1. To format as needed for model building (scaled, numerical data)
# 2. To produce a single file for local inference/testing (20,000 random rows)
# 3. To divide remaining data into 3 simulated bank files
#    (i.e. 3 different banks SAN, JPM, PNB)
# 4. To format the data for uploading to separate Access Points and provide
#    exposure to TripleBlind concepts and tools.

#########################################################################
#  PRE-PROCESS FOR FORMAT

# The ID column carries no signal for this analysis, so discard it.
df = df.drop(columns=["ID_code"])

##################
# Standardize the features while leaving the target variable untouched.
#
# Pull the target out first (as float32) so the scaler never sees it.
y = df["target"].values.astype(np.float32)
features = df.drop(columns=["target"])

# fit_transform() returns a plain array, so rebuild a DataFrame that carries
# the original feature column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)
df = pd.DataFrame(scaled_values, columns=features.columns)

# Re-attach the unscaled target as the first column.
df.insert(0, "target", y)

###########################################################################
# Split the data into parts
#
# Hold out 10% of the rows for testing; the other 90% is training data.
# The fixed random_state makes the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=101)

# Full test batch
df_test.to_csv(data_dir / "test.csv", index=False)

# Smaller test batch: the first 50 records of the test split
test_small = df_test.iloc[:50]
test_small.to_csv(data_dir / "test_small.csv", index=False)

# For convenience, also write the small batch as separate X and y csv files
# (using data science convention where X is the data, y is the target)
X = test_small.drop(columns=["target"])
y = test_small["target"].copy()

X.to_csv(data_dir / "test_small_X.csv", index=False)
y.to_csv(data_dir / "test_small_target.csv", header=["target"], index=False)

###########################################################################
#
# NOTE: No second split is performed here. The 10% local "test" set was
# already carved out and saved by the train_test_split call above
# (test_size=0.1, random_state=101).

###########################################################################
# Create 3 different training data files - one for each simulated company
#
# Each bank receives an equally sized, contiguous run of training rows. Two
# sets of files are written: the full-size files (60000 rows per bank) and
# tiny "_small" files (6 rows per bank).
#
# Slices are half-open -- [start:stop) excludes "stop" -- so every range must
# begin exactly where the previous one ended. Starting at "previous stop + 1"
# would silently leave the boundary rows out of every file.
bank_names = ("SAN", "JPM", "PNB")
file_variants = (("", 60000), ("_small", 6))  # (file name suffix, rows per bank)

for suffix, rows_per_bank in file_variants:
    for position, bank in enumerate(bank_names):
        start = position * rows_per_bank
        stop = start + rows_per_bank
        df_train.iloc[start:stop].to_csv(
            data_dir / f"{bank}{suffix}.csv", index=False
        )

print("Data is ready to be uploaded to Access Points.")
