#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

import os

import tripleblind as tb


# Start the SDK session using the third example user's API token, passing
# example=True.
tb.initialize(
    api_token=tb.config.example_user3["token"],
    example=True,
)

# The dataset searched against belongs to another organization (it is looked
# up by the second example user's team id), which has already given permission
# for the asset to be used in a linkage.  If the TB_TEST_SMALL environment
# variable is present (with any value) the "TEST" asset is requested,
# otherwise the "EXAMPLE" asset.
if "TB_TEST_SMALL" in os.environ:
    prefix = "TEST"
else:
    prefix = "EXAMPLE"
asset_name = f"{prefix} - Private Record Linkage"
their_data = tb.TableAsset.find(
    asset_name,
    owned_by=tb.config.example_user2["team_id"],
)

# Several different search datasets are included with this example so you can
# get comfortable with the linkage process.  Uncomment the one you want to
# use; you can examine or alter the file contents as you experiment.
#
# my_data = "exact_matches.csv"  # identical records are in the database
# my_data = "missing_data.csv"  # records with some missing data
my_data = "near_matches.csv"  # slightly different records are in the database
# my_data = "moved.csv"  # individuals who moved to a new address
# my_data = "new_phone.csv"  # individuals who changed their phone number
# my_data = "difficult.csv"  # married couple with similar names and identical data


# Columns used to establish links between the two datasets.  The column names
# do not have to be identical: a plain string names a single column, while a
# pair of names links two columns that are named differently (presumably in
# the same order as the datasets list -- confirm against the SDK docs).
linkage_columns = [
    ("LAST", "LAST_NAME"),
    ("FIRST", "FIRST_NAME"),
    ("MIDDLE", "MIDL_NAME"),
    "HOUSE_NUM",
    "STREET_NAME",
    "STREET_TYPE_CD",
    ("ZIP", "ZIP_CODE"),
    ("PHONE", "PHONE_NUM"),
    ("GENDER", "SEX_CODE"),
    "AGE",
]

# match_threshold is optional; the default of 0.88 typically works well and is
# simply spelled out here.
table = tb.TableAsset.find_linked_records(
    datasets=[my_data, their_data],
    match_columns=linkage_columns,
    match_threshold=0.88,
)

# Save results for later examination (written even when nothing matched).
table.dataframe.to_csv("result.csv", index=False)

print("Linkage results:")
# The output pairs each search record with the record it was matched to, so
# fewer than two rows means there is nothing to show.  len() counts rows
# directly; DataFrame.count() would instead give per-column non-null counts,
# which under-counts rows whose first column is missing and needs positional
# indexing into a label-indexed Series (deprecated in recent pandas).
# NOTE(review): assumes table.dataframe is a pandas DataFrame -- it is used
# with to_csv() here; confirm against the SDK.
if len(table.dataframe) < 2:
    raise SystemExit("    No matches found.")


# Explain how to read the combined table before printing it.
print("The top half of these results are the search records found in the other")
print("party's dataset.  To help in understanding the match process, in this")
print("example the bottom half of the results is the record each search record")
print("was matched with.  The 'cosine_similarity' column is a measure of how similar")
print("the records are to each other.\n")
print()
print("In a real-world use case, the bottom half of the results would not be")
print("revealed to the search party.  It is only being exposed in this protocol")
print("for the purpose of demonstrating the linkage process.")
print()
print(table.dataframe)
print(f"\n\nNOTE: Results can be found in asset {table.uuid} or result.csv")
