#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

from tripleblind.table_asset import StatFunc

import tripleblind as tb


# Start the session using example user 3's API token.
tb.initialize(api_token=tb.config.example_user3["token"], example=True)


# Look up the three parts of the example dataset.  Each part is owned by a
# different example team.
table1 = tb.TableAsset.find(
    "EXAMPLE - Basic Statistics (part 1)", owned_by=tb.config.example_user1["team_id"]
)
table2 = tb.TableAsset.find(
    "EXAMPLE - Basic Statistics (part 2)", owned_by=tb.config.example_user2["team_id"]
)
table3 = tb.TableAsset.find(
    "EXAMPLE - Basic Statistics (part 3)", owned_by=tb.config.example_user3["team_id"]
)

# Stop with a clear message if any dataset is missing, instead of failing
# later with an AttributeError when the missing asset is used.
# NOTE(review): assumes find() returns None when nothing matches -- confirm.
if table1 is None or table2 is None or table3 is None:
    raise SystemError(
        "Could not find all three 'EXAMPLE - Basic Statistics' datasets."
    )

# Compute a range of statistics for the "value" column across all three
# distributed datasets.  If the column were named differently in each
# dataset, you could instead list the per-dataset names:
#    column=["value1", "value2", "value3"]
#
result = table3.get_statistics(
    column="value",
    combine_with=[table1, table2],
    function=[
        StatFunc.MAXIMUM,
        StatFunc.MINIMUM,
        StatFunc.MEDIAN,
        StatFunc.QUARTILES,
        StatFunc.MEAN,
        StatFunc.VARIANCE,
        StatFunc.STANDARD_DEVIATION,
        StatFunc.SKEW,
        StatFunc.KURTOSIS,
        StatFunc.COUNT,
        StatFunc.CONFIDENCE_INTERVAL,
        StatFunc.STANDARD_ERROR,
    ],
)
# A falsy result means the calculation did not complete; nothing to report.
if not result:
    raise SystemError("Statistic calculation failed.")


# Banner, then the result table, then trailing blank lines.
print("\n=================================", "Overall statistics\n", sep="\n")
print(result.dataframe)
print("\n\n")

# Calculate statistics again, grouping on the "sex" field.
# NOTE: Since no specific function= is specified, all are calculated.
result_grouped = table3.get_statistics(
    "value",
    combine_with=[table1, table2],
    group_by="sex",
)
# get_statistics() yields a falsy result on failure, so check before touching
# .dataframe (same handling as the overall calculation earlier in the script).
if not result_grouped:
    raise SystemError("Statistic calculation grouped by 'sex' failed.")

print("\n=================================")
print("Statistics grouped by 'sex'\n")
print(result_grouped.dataframe)
print("\n\n")


# Repeat the calculation grouped on the "ethnicity" field with all three
# datasets included.  This request is expected to be refused because of the
# k-grouping setting, so a falsy result is reported rather than treated as
# an error.
result_ethnicity = table3.get_statistics(
    "value",
    group_by="ethnicity",
    combine_with=[table1, table2],
)
if not result_ethnicity:
    print("\nFailed to report by ethnicity due to k-grouping setting -- as expected.")
print("\n\n")


# Drop table3, which is the only one with k-grouping of 5.  The other two
# data owners only require k-grouping of 3, so results can be viewed when
# table3 isn't included.
result_ethnicity = table2.get_statistics(
    "value",
    combine_with=[table1],
    group_by="ethnicity",
)
# get_statistics() yields a falsy result on failure, so check before touching
# .dataframe instead of crashing with an AttributeError.
if not result_ethnicity:
    raise SystemError("Statistic calculation grouped by 'ethnicity' failed.")

print("\n=================================")
print("Statistics grouped by 'ethnicity'\n")
print(result_ethnicity.dataframe)
