#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

from pathlib import Path

import pandas as pd
import sklearn.datasets as datasets

import tripleblind as tb


# NOTE(review): presumably makes this script's own folder the working
# directory so the relative path below resolves next to it -- confirm
# against the tripleblind helper.
tb.util.set_script_dir_current()

# All generated CSV files are written into this folder.
data_dir = Path("example_data")
data_dir.mkdir(exist_ok=True)

#
# Setup Regression Data
#

# Load the California housing dataset and combine the feature matrix with
# the regression target, stored as a trailing "Price" column.
cali_data = datasets.fetch_california_housing()
cali_df = pd.DataFrame(
    cali_data.data,
    columns=cali_data.feature_names,
).assign(Price=cali_data.target)

# Hold out 10 randomly chosen rows as a test set (target column removed)
# and take those rows out of the remaining data.
cali_test = cali_df.sample(n=10).drop(columns="Price")
cali_df = cali_df.drop(index=cali_test.index)

# Randomly split the remaining rows into two disjoint halves; when the row
# count is odd, the second half receives the extra row.
cali_df_a = cali_df.sample(n=len(cali_df) // 2)
cali_df_b = cali_df.drop(index=cali_df_a.index)

# Write every partition to CSV without the DataFrame index column.
for csv_name, frame in (
    ("cali_housing_a.csv", cali_df_a),
    ("cali_housing_b.csv", cali_df_b),
    ("cali_housing_test.csv", cali_test),
):
    frame.to_csv(data_dir / csv_name, index=False)
