#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

import random
import time
from pathlib import Path

import pandas as pd
from faker import Faker

import tripleblind as tb


# Seed every pseudo-random source this script draws from so repeated runs pick
# the same people, prices, customers and stations. Faker.seed() only seeds
# Faker's own shared generator, so the standard `random` module used by the
# sampling code below is seeded separately. Timestamps still come from the
# wall clock and therefore differ between runs.
fake = Faker()
Faker.seed(0)
random.seed(0)

# NOTE(review): presumably switches the working directory to this script's
# folder so the relative output path below resolves next to it -- confirm
# against tb.util.
tb.util.set_script_dir_current()

# All generated CSV files are written into this folder (created on demand).
data_dir = Path("example_data")
data_dir.mkdir(exist_ok=True)

# Create a population of fake individuals, each stored as [name, address]
num_people = 10
population = []
for _ in range(num_people):
    person_name = fake.name()
    # Collapse any line breaks in the generated address so it fits on one line
    person_address = fake.address().replace("\n", "; ")
    population.append([person_name, person_address])

# Build an imaginary database for a retail store's transaction data.
#
# Each row is one transaction with the columns:
#   price, date, name, address

num_retail_transactions = 50
retail_customers = 0.5  # fraction of the population who shops at this store

# Assume the top portion of the population shops at this store, i.e. the
# people at indexes first_customer..last_customer (inclusive) of `population`
total_retail_population = int(retail_customers * len(population))
first_customer = 0
last_customer = total_retail_population - 1

transaction_data = []
for _ in range(num_retail_transactions):
    # Draw an integer in 25..2500 and divide by 100, giving 0.25..25.00
    cents = random.randint(25, 2500)
    # Every row is stamped with the moment it was generated
    timestamp = time.time()
    # Both bounds of randint are inclusive, matching last_customer above
    customer_index = random.randint(first_customer, last_customer)
    shopper_name, shopper_address = population[customer_index]

    transaction_data.append(
        {
            "price": cents / 100.0,
            "date": timestamp,
            "name": shopper_name,
            "address": shopper_address,
        }
    )

# One CSV row per transaction; the DataFrame index is not written out
df2 = pd.DataFrame(transaction_data)
df2.to_csv(data_dir / "store_transactions.csv", index=False)

# Build an imaginary database of a transit system's passenger information.
#
# Each database row is a single trip with the columns:
#   depart_station, depart_date, arrive_station, arrive_date,
#   customer_name, customer_address
num_stations = 100
num_trips = 500
transport_customers = 0.5  # fraction of the population who uses transit

# Assume the middle portion of the population list uses transit, i.e. the
# people at indexes first_passenger..last_passenger (inclusive)
total_passenger_population = int(len(population) * transport_customers)
first_passenger = (len(population) - total_passenger_population) // 2
last_passenger = (first_passenger + total_passenger_population) - 1

transport_data = []
for _ in range(num_trips):
    # Pick two different stations in a single draw. random.sample never
    # repeats an element, and it raises ValueError when fewer than two
    # stations exist instead of retrying forever.
    orig, dest = random.sample(range(num_stations), 2)

    depart_date = time.time()
    # Trip duration scales with how far apart the station numbers are:
    # 100 seconds per station
    arrive_date = depart_date + abs(orig - dest) * 100

    name, address = population[random.randint(first_passenger, last_passenger)]

    # Munge the address information for customers so it is a little different
    # than what is in the retailer's database
    address = address.replace("Apt.", "Apartment").replace("Suite", "Ste")

    transport_data.append(
        {
            "depart_station": f"station_{orig}",
            "depart_date": depart_date,
            "arrive_station": f"station_{dest}",
            "arrive_date": arrive_date,
            "customer_name": name,
            "customer_address": address,
        }
    )

# One CSV row per trip; the DataFrame index is not written out
df = pd.DataFrame(transport_data)
df.to_csv(data_dir / "passengers.csv", index=False)
