#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

import numpy as np

import tripleblind as tb


# Initialize the SDK with example user 1's API token (example mode).
tb.initialize(example=True, api_token=tb.config.example_user1["token"])

# Look up both example tables by name, each restricted to the team that owns
# it: the shop transactions are owned by user 1's team, the transport data by
# user 2's team.
purchases = tb.TableAsset.find(
    "EXAMPLE - Shop Transactions",
    owned_by=tb.config.example_user1["team_id"],
)
trips = tb.TableAsset.find(
    "EXAMPLE - Transport Data",
    owned_by=tb.config.example_user2["team_id"],
)

# Perform a Private Set Intersection between the two databases, matching on
# address. Return the address from your database, plus the depart and arrive
# stations from theirs.
overlap = purchases.blind_join(
    intersect_with=trips,
    match_column=["address", "customer_address"],
    # One list of columns per party: ours first, then theirs.
    return_columns=[["address"], ["depart_station", "arrive_station"]],
    silent=False,
    join_type=tb.JoinType.INNER_PARTITIONED,
)

# Guard against a join that produced no result object, mirroring the check
# used for the fuzzy join; without it a failed join would crash on `.dataframe`.
if overlap:
    exact_df = overlap.dataframe
    # The CSV is written even when the result is empty, so the output file
    # always reflects the latest run.
    exact_df.to_csv("overlap_exact.csv", index=False)
    if exact_df.empty:
        print("Blind Join returned no intersecting records.")
    else:
        # Treat empty strings as missing so they are not counted as a customer,
        # then count the distinct remaining addresses.
        exact_addresses = exact_df["address"].replace("", np.nan).dropna()
        print(
            "Number of common customers: ",
            len(exact_addresses.value_counts()),
        )
else:
    # No overlap found during the Join
    print("Failed to find overlap.")


# Perform a "fuzzy" intersection between the two databases on address, so that
# addresses which match only approximately also count as common between the
# two databases.
overlap_fuzzy = purchases.blind_join(
    intersect_with=trips,
    join_type=tb.JoinType.INNER_PARTITIONED,
    match_column=["address", "customer_address"],
    match_fuzziness=0.3,
    # One list of columns per party: ours first, then theirs.
    return_columns=[["address"], ["depart_station", "arrive_station"]],
    silent=False,
)

if not overlap_fuzzy:
    # No overlap found during the Join
    print("Failed to find overlap.")
else:
    fuzzy_df = overlap_fuzzy.dataframe
    if fuzzy_df.empty:
        print("Blind Join returned no intersecting records.")
    else:
        # Empty strings are treated as missing values and dropped before
        # counting the distinct addresses.
        fuzzy_addresses = fuzzy_df["address"].replace("", np.nan).dropna()
        print(
            "Number of common customers (fuzzy matched): ",
            len(fuzzy_addresses.value_counts()),
        )
    # The result archive is retrieved whether or not any rows matched.
    overlap_fuzzy.retrieve("overlap_fuzzy.zip", overwrite=True)
    print("Results saved as 'overlap_fuzzy.zip'")
