#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import tripleblind as tb


# Fetch the raw training CSV and load it with pandas.
#
# NOTE(review): set_script_dir_current() presumably switches the working
# directory to this script's folder so the relative paths below resolve
# consistently -- confirm against the tripleblind docs.
tb.util.set_script_dir_current()

# Every file this script reads or writes lives under this directory.
data_dir = Path("example_data")

# The helper's return value is handed straight to pd.read_csv(), so it is
# expected to be a path (or file-like object) for the downloaded CSV.
source_csv = tb.util.download_tripleblind_resource(
    "santander-customer-transaction-prediction_train.csv",
    cache_dir="../../.cache",
    save_to_dir=data_dir,
)

print("Creating datasets...")
df = pd.read_csv(source_csv)


#########################################################################
# The rest of this script processes the loaded data in order to:
# 1. Format it as needed for model building (scaled, numerical data)
# 2. Produce a single file for local inference/testing (a random ~10% of
#    the rows)
# 3. Divide the remaining rows into 3 simulated bank files, each holding a
#    different group of feature columns (i.e. 3 different banks SAN, JPM, PNB)
# 4. Format the data for uploading to separate Access Points and provide
#    exposure to TripleBlind concepts and tools.

#########################################################################
#  PRE-PROCESS FOR FORMAT

# Set the record identifier aside. It must not be scaled, but it is put back
# afterwards so that rows can still be identified.
id_codes = df["ID_code"].copy()

# Likewise keep the label out of the scaler; it is stored as float32.
target = df["target"].to_numpy().astype(np.float32)

# Everything that remains is a feature column.
features = df.drop(columns=["ID_code", "target"])

##################
# Scale the features, but not the target variable
#
# StandardScaler returns a plain array, so rebuild a DataFrame using the
# original feature column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)
df = pd.DataFrame(scaled_values, columns=features.columns)

# Re-attach the unscaled columns at the front. Inserting "target" first and
# "ID_code" second leaves the final order: ID_code, target, features...
df.insert(0, "target", target)
df.insert(0, "ID_code", id_codes)

# Optional quick mode: when TB_TEST_SMALL is set to a non-empty value, keep
# only the first 100 rows.
if os.environ.get("TB_TEST_SMALL"):
    df = df.iloc[:100]

###########################################################################
# Split the data into parts
#
# Hold out roughly TEST_SIZE of the rows as a local, labeled "test" set.
# Each row is assigned independently at random and the generator is not
# seeded, so the exact split differs from run to run.
TEST_SIZE = 0.1
is_train = np.random.rand(len(df)) < (1 - TEST_SIZE)

# Create training data
train_rows = df[is_train]
train_ids = train_rows["ID_code"]
train_labels = train_rows["target"]
train_features = train_rows.drop(columns=["ID_code", "target"])

# First vertical partition: feature columns at positions 0-39, plus the ID
# and the label (this is the only partition that carries the label).
# NOTE(review): only the first 10 rows of this partition are written --
# presumably intentional for the PSI example; confirm.
labeled_part = train_features.iloc[:, :40].assign(
    ID_code=train_ids,
    target=train_labels,
)
labeled_part = labeled_part.iloc[:10]
labeled_part.to_csv(data_dir / "sant_psi_vertical_0_40_train.csv", index=False)

# Remaining vertical partitions: feature columns at positions 40-99 and
# 100-end, each with the ID appended but without the label.
for file_name, column_span in (
    ("sant_psi_vertical_41_100_train.csv", slice(40, 100)),
    ("sant_psi_vertical_101_200_train.csv", slice(100, None)),
):
    unlabeled_part = train_features.iloc[:, column_span].assign(ID_code=train_ids)
    unlabeled_part.to_csv(data_dir / file_name, index=False)

# Create testing data: the held-out rows with every column (ID, label and
# all features) in a single file.
held_out = df[~is_train]
held_out.to_csv(data_dir / "sant_psi_vertical_test.csv", index=False)
