#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

import tripleblind as tb


# NOTE(review): presumably switches the working directory to this script's
# folder so "example_data" is created next to it -- confirm against the SDK.
tb.util.set_script_dir_current()
data_dir = Path("example_data")
data_dir.mkdir(exist_ok=True)

# Build Fake Data
# Both random sources are seeded with fixed values so that repeated runs
# draw the same sequence of values.
fake = Faker()
Faker.seed(984)
np.random.seed(498)


def _one_line_address():
    """Return a fake postal address with line breaks flattened to '; '."""
    return fake.address().replace("\n", "; ")


def _fake_student():
    """Return one synthetic applicant record as a plain dict.

    The fields are drawn in a fixed order; changing that order would change
    which random values land in which column for a given seed.
    """
    return {
        "name": fake.name(),
        "gmat": fake.random_int(500, 800),
        "gpa": round(np.random.uniform(2.5, 4.0), 1),
        "work_experience": fake.random_int(1, 8),
        "age": fake.random_int(22, 35),
        "admitted": fake.random_int(0, 2),
        "home_address": _one_line_address(),
    }


# Create a population of fake individuals
num_students = 2000
fake_students = [_fake_student() for _ in range(num_students)]
students_frame = pd.DataFrame(fake_students)

# A single, deliberately extreme record for use in the Outlier_Detection
# example.  It is appended last, so after the split below it ends up in
# college_acceptance_b.csv.
outlier = [
    {
        "name": "Otto Outlier",
        "gmat": 2000,
        "gpa": 4,
        "work_experience": 20,
        "age": 45,
        "admitted": 2,
        "home_address": _one_line_address(),
    }
]
outlier_frame = pd.DataFrame(outlier)

train_data = pd.concat([students_frame, outlier_frame], ignore_index=True)
# Sequential identifier: 0 .. number_of_rows - 1.
train_data["id"] = range(train_data.shape[0])

full_csv = data_dir / "college_acceptance.csv"
train_data.to_csv(full_csv, index=False)

#
# Setup Classification Data
#

# Reload the file that was just written and cut it into two halves by row
# position; with an odd row count the second half gets the extra row.
base_data = pd.read_csv(full_csv)
split = len(base_data) // 2

data_a = base_data.iloc[:split]
data_b = base_data.iloc[split:]

# Save for later
for half, file_name in (
    (data_a, "college_acceptance_a.csv"),
    (data_b, "college_acceptance_b.csv"),
):
    half.to_csv(data_dir / file_name, index=False)
