#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

import pandas as pd

import tripleblind as tb


# In this example the retailer (example_user1) runs the report, and the
# transit company (example_user2) will be asked for permission to access
# its data.
tb.initialize(example=True, api_token=tb.config.example_user1["token"])

# Locate each party's table by name, scoped to the team that owns it.
transit_team = tb.config.example_user2["team_id"]
trips = tb.TableAsset.find("EXAMPLE - Transport Data", owned_by=transit_team)

retail_team = tb.config.example_user1["team_id"]
purchases = tb.TableAsset.find("EXAMPLE - Shop Transactions", owned_by=retail_team)

# Find the customers common to both databases by matching on address.
# The first column name is passed for the purchases table (the caller) and
# the second for the trips table (the intersecting asset).
address_columns = ["address", "customer_address"]

# NOTE: This will throw an exception if the "depart_station" and
#       "arrive_station" fields haven't been unmasked.
station_columns = ["depart_station", "arrive_station"]

overlap = purchases.blind_join(
    intersect_with=trips,
    join_type=tb.JoinType.INNER_PARTITIONED,
    match_column=address_columns,
    match_fuzziness=0.3,
    return_columns=station_columns,
    silent=False,
)

# Report the ten most frequently visited stations among matched customers,
# then save the raw join result locally for inspection.
if overlap:
    df = overlap.dataframe
    if df.empty:
        print("Blind Join returned no intersecting records.")
    else:
        # A station counts as "visited" whether it was the arrival or the
        # departure end of a trip, so tally both columns together.
        counts = pd.concat([df["arrive_station"], df["depart_station"]]).value_counts()

        # Sort and limit to the top 10. head() is always positional, so it
        # behaves the same whether station ids are strings or integers.
        top10 = counts.sort_values(ascending=False).head(10)
        print()
        print("  Top 10 stations")
        print("station_id    # visits")
        print("----------    --------")
        # Series.iteritems() was removed in pandas 2.0; items() is the
        # equivalent that works on both old and new pandas versions.
        for station_id, visits in top10.items():
            print(f"{station_id:10}       {visits}")

    # Save values for inspection/validation
    overlap.retrieve("join_inner_partitioned.zip", overwrite=True)
    print("Results saved as 'join_inner_partitioned.zip'")
else:
    # The join produced a falsy result, so there is nothing to report
    print("Failed to find overlap.")
