from convokit import Corpus, download
import os

# Load the small Reddit corpus: download() supplies the value that Corpus
# receives as its `filename` argument.
corpus_location = download("reddit-corpus-small")
corpus = Corpus(filename=corpus_location)

# Output directory; one text file per conversation is written into it.
data_path = "surfaceweb_data/reddit"

# os.makedirs also creates missing parent directories. The previous os.mkdir
# failed whenever "surfaceweb_data" was absent, and its blanket
# `except OSError` reported that failure (and any other, e.g. permissions) as
# "Directory already exists.", deferring the real crash to the first file
# write. Genuine errors now propagate from makedirs.
if os.path.isdir(data_path):
    print("Directory already exists.")
else:
    # exist_ok guards against the directory appearing between the check above
    # and this call.
    os.makedirs(data_path, exist_ok=True)

# Upper bound on the number of utterances consumed from the corpus.
MAX_UTTERANCES = 80000

# Collect each conversation's utterance texts in iteration order. The pieces
# are gathered in lists and joined once at the end instead of growing a string
# with repeated `+=`.
# Assumes every utt.text is a str -- TODO confirm against the corpus.
_texts_by_convo = {}
for utterance_count, utt in enumerate(corpus.iter_utterances(), start=1):
    _texts_by_convo.setdefault(utt.conversation_id, []).append(utt.text)

    # Stop right after the MAX_UTTERANCES-th utterance has been recorded.
    if utterance_count == MAX_UTTERANCES:
        break

# Maps conversation id -> concatenation of its utterance texts. No separator
# is inserted between consecutive utterances.
reddit_convos = {
    convo_id: "".join(parts) for convo_id, parts in _texts_by_convo.items()
}

# Write every non-empty conversation to its own file inside data_path, using
# the conversation id as the file name.
for convo_id, text in reddit_convos.items():
    # Conversations with no text produce no file.
    if not text:
        continue

    # Explicit UTF-8: without `encoding`, open() falls back to the platform's
    # default encoding, which may be unable to represent characters in the
    # text (raising UnicodeEncodeError) and makes the output files differ
    # between machines.
    with open(os.path.join(data_path, convo_id), 'w', encoding="utf-8") as f:
        f.write(text)

print("Dataset creation complete.")
