#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

from pathlib import Path

import tripleblind as tb
from preprocessor.data_generator import MockType


##############################################################################
#
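# Script to create college acceptance data.  Two datasets are created and
# placed on the access points for example_user1 and example_user2.  Agreements
# are attached so that training jobs from any organization can operate
# hands-free against both datasets, and the organization associated with
# example_user3 is additionally granted BLIND_SAMPLE on the first dataset.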
#
##############################################################################

# Example account records taken from the SDK configuration.  The script reads
# the "token", "login" and "team_id" entries of these records below.
user1 = tb.config.example_user1
user2 = tb.config.example_user2

# Name and description under which the first dataset is positioned (as user1).
# The description opens with a Markdown image link followed by plain text.
name1 = "EXAMPLE - NYU Student Admissions Data"
desc1 = """
![logo](https://upload.wikimedia.org/wikipedia/it/7/75/Nyu_logo2.gif)
This dataset contains fictional data about students that applied to NYU and
associated information about their admission status.

The NYU Admissions Student Data contains observations at the individual
student level. This data contains a categorical indicator of the student's
admittance status. This categorical variable indicates whether
the applicant was accepted, placed on a waiting list, or not admitted. See the
associated UMN Student Admissions dataset.
"""

# Name and description under which the second dataset is positioned (as user2).
name2 = "EXAMPLE - UMN Student Admissions Data"
desc2 = """
![logo](https://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/University_of_Minnesota_Logo.svg/89px-University_of_Minnesota_Logo.svg.png)
This dataset contains fictional data about students that applied to UMN and
associated information about their admission status.

The College Acceptance dataset contains observations at the individual
student level. This data contains a categorical indicator of the student's
admittance status. This categorical variable indicates whether
the applicant was accepted, placed on a waiting list, or not admitted. See the
associated NYU Student Admissions dataset.
"""

# NOTE(review): presumably switches the working directory to this script's
# folder so that the relative `data_dir` below resolves -- confirm in the SDK.
tb.util.set_script_dir_current()
# Initialize the SDK with user1's token.  Calls further down that do not pass
# an explicit `session` presumably run under this identity -- confirm.
tb.initialize(api_token=user1["token"])
# Relative folder holding the CSV inputs; the packaged .zip files are written
# to the same folder.
data_dir = Path("example_data")

#############################################################################
#  Organization one
#############################################################################

# Session acting as user1.  It is created outside the try block so the name is
# always bound, including when the "already exists" branch is taken, and so
# the try body only covers the calls that create the asset.
session1 = tb.Session(api_token=user1["token"], from_default=True)

try:
    # Package the CSV into a .zip next to it, then position that package as a
    # discoverable table asset on user1's Access Point.
    package_data_a = tb.Package.create(
        filename=data_dir / "college_acceptance_a.zip",
        record_data=data_dir / "college_acceptance_a.csv",
    )
    print(f"Creating dataset '{name1}' on {user1['login']}'s Access Point...")
    dataset1 = tb.TableAsset.position(
        file_handle=package_data_a,
        name=name1,
        desc=desc1,
        is_discoverable=True,
        session=session1,
    )
except tb.TripleblindAssetAlreadyExists:
    # The script was run before: reuse the asset already owned by user1's team.
    print(f"   asset '{name1}' already exists, skipping.")
    dataset1 = tb.TableAsset.find(name1, owned_by=user1["team_id"])

# Attach an Agreement to the dataset to allow anyone to train against
# this dataset without further interaction.
dataset1.add_agreement(
    with_team="ANY",
    operation=tb.Operation.RANDOM_FOREST_TRAIN,
    session=session1,
)
print("Created Agreement for any to train random forest models against this dataset.\n")


# Explicitly assign mask types to the "name" and "home_address" columns so
# that samples of those columns are produced using name and address masking.
# NOTE(review): an earlier version of this comment spoke of an "addr" column
# and of SSN/Name/home_phone being auto-detected; the columns configured here
# are "name" and "home_address" -- confirm against the CSV header.
dataset1.mask_columns(col_names="name", mask_type=MockType.name)
dataset1.mask_columns(col_names="home_address", mask_type=MockType.address)

# By default all values are masked, but the data owner has determined that
# these values in and of themselves aren't sensitive and can be safely viewed
# without other identifying values. Unmask them.
dataset1.unmask_columns(col_names=["gmat", "gpa", "work_experience", "age", "admitted"])

# Grant example_user3's team the BLIND_SAMPLE operation on this dataset.  The
# session is passed explicitly, like the other agreement calls in this script,
# rather than relying on the default session.
dataset1.add_agreement(
    with_team=tb.config.example_user3["team_id"],
    operation=tb.Operation.BLIND_SAMPLE,
    session=session1,
)

#############################################################################
#  Organization two
#############################################################################

# Session acting as user2; used for positioning and for every agreement below.
session2 = tb.Session(api_token=user2["token"], from_default=True)

try:
    # Package the second CSV and position it on user2's Access Point.
    package_data_b = tb.Package.create(
        filename=data_dir / "college_acceptance_b.zip",
        record_data=data_dir / "college_acceptance_b.csv",
    )
    print(f"Creating dataset '{name2}' on {user2['login']}'s Access Point...")
    dataset2 = tb.TableAsset.position(
        file_handle=package_data_b,
        name=name2,
        desc=desc2,
        is_discoverable=True,
        session=session2,
    )
except tb.TripleblindAssetAlreadyExists:
    # Asset is already registered: look it up by name within user2's team.
    print(f"   asset '{name2}' already exists, skipping.")
    dataset2 = tb.TableAsset.find(name2, owned_by=user2["team_id"])

# Attach Agreements to the dataset to allow anyone to run these operations
# against it without further interaction: first random forest training,
# then outlier detection.
for operation in (
    tb.Operation.RANDOM_FOREST_TRAIN,
    tb.Operation.OUTLIER_DETECTION,
):
    dataset2.add_agreement(with_team="ANY", operation=operation, session=session2)
print(
    "Created Agreement for any to train random forest models and detect outliers against this dataset.\n"
)


print("Data is in position.")
