#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

from pathlib import Path

import tripleblind as tb


# Anchor relative paths to this script.
# NOTE(review): presumably switches the working directory to the folder that
# contains this file so "example_data" resolves regardless of where the script
# is launched from -- confirm against tb.util.
tb.util.set_script_dir_current()
data_dir = Path("example_data")  # folder holding the CSV files positioned below
prefix = "EXAMPLE - "  # prepended to the training asset names
name0 = "Gene Training Data 0"  # asset name for train0.csv
name1 = "Gene Training Data 1"  # asset name for train1.csv

##########################################################################
# GET AUTHENTICATION TOKENS AND ESTABLISH CONNECTION TO THE MARKETPLACE
#
# Establish the connection details to reach the TripleBlind market instance.
# Unless explicitly specified, all operations will occur via this default
# session as the user 'organization_one'
tb.initialize(api_token=tb.config.example_user1["token"])

# Register the first training dataset with the Marketplace and place it on
# the second organization's Access Point.  If an asset with this name is
# already registered, the step is reported and skipped so the script can be
# re-run safely.
try:
    # Start a connection as 'organization_two' user
    session2 = tb.Session(api_token=tb.config.example_user2["token"], from_default=True)

    print(
        f"Creating dataset '{prefix}{name0}' on {tb.config.example_user2['login']}'s Access Point..."
    )
    dataset_train0 = tb.asset.CSVDataset.position(
        file_handle=data_dir / "train0.csv",
        name=f"{prefix}{name0}",
        desc="""The first half of the gene expression data published by Golub et
        al in "Molecular Classification of Cancer: Class Discovery and Class Prediction
        by Gene Expression Monitoring" in 1999.  These datasets contain measurements
        corresponding to ALL and AML samples from Bone Marrow and Peripheral Blood.
        Intensity values have been re-scaled such that overall intensities for each
        chip are equivalent.

See the original data and more details on Kaggle at: https://www.kaggle.com/code/varimp/gene-expression-classification/notebook.""",
        is_discoverable=True,
        session=session2,
        auto_rename_columns=True,
    )

    if tb.config.create_agreements_on_example_input_assets:
        # For this example we will attach an Agreement to the datasets.  This
        # agreement makes the dataset available to the other organization,
        # meaning the training step will not require the dataset owner to
        # explicitly grant permission when it is used in the next step.
        dataset_train0.add_agreement(
            with_team=tb.config.example_user1["team_id"],
            operation=tb.Operation.REGRESSION,
            session=session2,
        )
        print("Created Agreement")
except tb.TripleblindAssetAlreadyExists:
    # Fixed: the message previously ended with a stray, unbalanced quote.
    print(f"Asset '{prefix}{name0}' already exists.")


# Register the second training dataset with the Marketplace and place it on
# the third organization's Access Point.  If an asset with this name is
# already registered, the step is reported and skipped so the script can be
# re-run safely.
try:
    # Start a connection as 'organization_three' user
    session3 = tb.Session(api_token=tb.config.example_user3["token"], from_default=True)

    print(
        f"Creating dataset '{prefix}{name1}' on {tb.config.example_user3['login']}'s Access Point..."
    )
    dataset_train1 = tb.asset.CSVDataset.position(
        file_handle=data_dir / "train1.csv",
        name=f"{prefix}{name1}",
        desc="""The second half of the gene expression data published by Golub et
        al in "Molecular Classification of Cancer: Class Discovery and Class Prediction
        by Gene Expression Monitoring" in 1999.  These datasets contain measurements
        corresponding to ALL and AML samples from Bone Marrow and Peripheral Blood.
        Intensity values have been re-scaled such that overall intensities for each
        chip are equivalent.

See the original data and more details on Kaggle at: https://www.kaggle.com/code/varimp/gene-expression-classification/notebook.""",
        is_discoverable=True,
        session=session3,
        auto_rename_columns=True,
    )

    if tb.config.create_agreements_on_example_input_assets:
        # Attach an Agreement for the REGRESSION operation with the first
        # organization's team, so that team does not need an explicit
        # permission grant from the dataset owner when it uses this dataset.
        dataset_train1.add_agreement(
            with_team=tb.config.example_user1["team_id"],
            operation=tb.Operation.REGRESSION,
            session=session3,
        )
        print("Created Agreement")
except tb.TripleblindAssetAlreadyExists:
    # Fixed: the message previously ended with a stray, unbalanced quote.
    print(f"Asset '{prefix}{name1}' already exists.")

# Cache the test dataset on the second organization's Access Point for later
# use during inference.  It is positioned through session2 and is not
# discoverable.
#
# The name is bound before the try so the except handler never depends on a
# binding made inside the guarded body.
name = "EXAMPLE - Test data - gene regression"
try:
    # Report the configured login, as the training sections do, instead of a
    # hard-coded organization name.
    print(f"Uploading '{name}' to {tb.config.example_user2['login']}'s Access Point...")
    test_asset_0 = tb.asset.CSVDataset.position(
        data_dir / "test.csv",
        desc="A set of test data used during inference",
        name=name,
        is_discoverable=False,
        session=session2,
        auto_rename_columns=True,
    )

except tb.TripleblindAssetAlreadyExists:
    # Fixed: the message previously ended with a stray, unbalanced quote.
    print(f"Asset '{name}' already exists.")

print("Data is in position.")
