#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

import tripleblind as tb


# NOTE(review): presumably makes this script's own directory the working
# directory so the relative path below resolves next to the script -- confirm
# against the tripleblind documentation.
tb.util.set_script_dir_current()
# Folder that receives every generated CSV; created on first run and reused
# afterwards (exist_ok=True makes the call a no-op when it already exists).
data_dir = Path("example_data")
data_dir.mkdir(exist_ok=True)


def _write_regression_csv(features, target, path):
    """Write a feature matrix and its target vector to ``path`` as one CSV.

    The features become the leading columns and ``target`` is appended as the
    last column. Purely numeric column labels (the integer labels pandas gives
    an array-backed frame) are prefixed with an underscore, e.g. ``0`` becomes
    ``_0``. The row index is not written.
    """
    frame = pd.DataFrame(features)
    frame["target"] = target
    # Rename each column to _# if its label is a number only. str() keeps the
    # check safe for labels that are neither plain ints nor strings.
    frame.columns = [
        f"_{col}"
        if isinstance(col, (int, np.integer)) or str(col).isnumeric()
        else col
        for col in frame.columns
    ]
    frame.to_csv(path, index=False)


def create_datasets(n_samples, n_outliers, features, data_suffix):
    """Generate a synthetic regression problem and save it as CSV files.

    Four files are written into ``data_dir``:

    * ``full_reg_random_train{data_suffix}.csv`` -- the full training split
    * ``regression_test{data_suffix}.csv`` -- the held-out test split
    * ``split_0_reg_random_train{data_suffix}.csv`` -- first 75% of the
      training rows
    * ``split_1_reg_random_train{data_suffix}.csv`` -- the remaining training
      rows

    Args:
        n_samples: Total number of rows to generate.
        n_outliers: Number of leading rows that are overwritten with outlier
            values before the train/test split.
        features: Number of feature columns. ``int(features * 0.60)`` of them
            are informative and ``int(features * 0.10)`` is passed as the
            noise level.
        data_suffix: Text inserted just before ``.csv`` in every file name.
    """
    X, y = datasets.make_regression(
        n_samples=n_samples,
        n_features=features,
        n_informative=int(features * 0.60),
        noise=int(features * 0.10),
        random_state=0,
    )

    # Add outlier data. A local generator seeded with 0 yields the same draws
    # as seeding the global NumPy RNG, without reseeding it for other code.
    rng = np.random.RandomState(0)
    # One draw per outlier row, broadcast across every feature column.
    X[:n_outliers] = 3 + 0.5 * rng.normal(size=(n_outliers, 1))
    y[:n_outliers] = -3 + 10 * rng.normal(size=n_outliers)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )

    _write_regression_csv(
        X_train, y_train, data_dir / f"full_reg_random_train{data_suffix}.csv"
    )
    _write_regression_csv(
        X_test, y_test, data_dir / f"regression_test{data_suffix}.csv"
    )

    # Create uneven splits: 75% of the training rows go to split 0 and the
    # rest to split 1.
    split_index = int(len(X_train) * 0.75)
    _write_regression_csv(
        X_train[:split_index],
        y_train[:split_index],
        data_dir / f"split_0_reg_random_train{data_suffix}.csv",
    )
    _write_regression_csv(
        X_train[split_index:],
        y_train[split_index:],
        data_dir / f"split_1_reg_random_train{data_suffix}.csv",
    )


# Each entry is (data_suffix, n_samples, n_outliers). The regular sized
# datasets are created first, then the small datasets used for rapid
# training/testing. Both variants use 10 features.
for suffix, sample_count, outlier_count in (
    ("", 100_000, 5_000),
    ("_small", 100, 5),
):
    create_datasets(
        n_samples=sample_count,
        n_outliers=outlier_count,
        features=10,
        data_suffix=suffix,
    )
