#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

import tripleblind as tb


# Work relative to this script's directory so the relative paths below resolve.
tb.util.set_script_dir_current()
data_dir = Path("example_data")

# Fetch the ratings dataset into data_dir (reusing the shared cache directory)
# and load it with pandas.
train_data = tb.util.download_tripleblind_resource(
    "ratings_small.csv", save_to_dir=data_dir, cache_dir="../../.cache"
)
df = pd.read_csv(train_data)

# Hold out 0.1% of the rows as a test set.  No random_state is passed, so the
# partition is expected to differ from run to run.
train, test = train_test_split(df, test_size=0.001)

# Divide the training rows into two halves, one per client.  With an odd row
# count the second half receives the extra row.
midpoint = int(len(train) * 0.5)
train0 = train.iloc[:midpoint]
train1 = train.iloc[midpoint:]

for client_rows, file_name in ((train0, "ratings0.csv"), (train1, "ratings1.csv")):
    client_rows.to_csv(data_dir / file_name, index=False)

# Keep only the userId column of the held-out rows; it is saved for use during
# later inference.
test_df = test[["userId"]]
test_df.to_csv(data_dir / "ratings_test.csv", index=False)
