#!/usr/bin/env python
# Copyright (c) TripleBlind Holdings, Inc. Confidential and Proprietary. All rights reserved.

import os
import random
import shutil
import tarfile
import tempfile
from pathlib import Path

import pandas as pd
import requests


# Step 1: fetch the IMDB sentiment archive and unpack it into ./rating_data/.
print("Downloading Data.")
data = requests.get(
    "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    allow_redirects=True,
    # (connect, read) timeouts in seconds; requests applies no timeout by
    # default, so a stalled server would otherwise hang the script.
    timeout=(10, 120),
)
# Stop on a 4xx/5xx answer instead of saving an error page and failing later
# with an unrelated-looking tarfile error.
data.raise_for_status()

print("Unzipping to data directory...")
# delete=False: the archive is reopened by name after this handle is closed,
# so removing it is done explicitly in the `finally` below.
tmp_tar = tempfile.NamedTemporaryFile("wb", delete=False)
try:
    with tmp_tar:
        tmp_tar.write(data.content)

    with tarfile.open(tmp_tar.name) as f:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: refuse members that would
            # land outside the destination directory.
            f.extractall("./rating_data/", filter="data")
        else:
            f.extractall("./rating_data/")
finally:
    # Remove the temporary archive even when writing or extraction failed.
    os.unlink(tmp_tar.name)

# Step 2: read every training review into memory, then drop the extracted tree.
print("Preprocessing data.")
split_dir = Path("rating_data/aclImdb/train")
texts = []
labels = []
for label_dir in ["pos", "neg"]:
    # The label depends only on the directory, so compute it once per directory.
    rating = 0 if label_dir == "neg" else 1
    # sorted(): iterdir() yields entries in arbitrary order; sorting makes the
    # row order of the resulting dataset reproducible across machines.
    for text_file in sorted((split_dir / label_dir).iterdir()):
        # Explicit encoding so decoding does not depend on the platform locale.
        texts.append(text_file.read_text(encoding="utf-8"))
        labels.append(rating)

print("Removing temporary files.")
shutil.rmtree("./rating_data")

# Step 3: persist the complete dataset, then a small shuffled training subset.
print("Creating full csv dataset for future use.")
full_df = pd.DataFrame({"comments": texts, "ratings": labels})
full_df.to_csv("movie_ratings.csv", index=False)

for notice in (
    "Creating new training datasets.",
    "Disclaimer: Only 40 samples are used for training for purposes of this demo.",
    "On larger datasets, it is recommended to utilize GPU support and larger memory instances.",
):
    print(notice)

# Shuffle reviews and labels together so each comment keeps its rating.
paired = list(zip(texts, labels))
random.shuffle(paired)
texts = [comment for comment, _ in paired]
labels = [rating for _, rating in paired]

# Only the first 40 shuffled rows go into the demo training file.
df = pd.DataFrame({"comments": texts[:40], "ratings": labels[:40]})
df.to_csv("movie_rating_train.csv", index=False)
