# Clean all transcripts in a directory (in place).
# Run separately for the control data and for the aphasia data.

import re
import os

# Directory whose transcripts are cleaned in place. Point this at the control
# data or the aphasia data before running the script.
directory_path = "./aphasia"

# Body of the "@G: Window" task: every line after the header up to (not
# including) the next line that starts with "@". The inner group is
# non-capturing so that only the whole section is captured; with a second
# capture group the last line of the section would be returned separately and
# end up duplicated when the groups are joined.
_TASK_SECTION = re.compile(r'\@G:\s+Window\n((?:[^\@].*\n)*)')

# Spans delimited by NAK (0x15) control characters, e.g. "\x150_1000\x15".
# The delimiters are written as explicit escapes so they cannot be lost when
# the source file is copied or re-encoded.
_NAK_SPAN = re.compile(r'\x15[^\x15\n]*\x15')

# A participant turn: "*PAR:" and everything up to the next "%" character.
_PARTICIPANT_TURN = re.compile(r'\*PAR:[^\%]*')

# Ordered (pattern, replacement) table. The order matters: later rules rely on
# markers ("REP") and deletions produced by earlier ones.
_SUBSTITUTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # ------------------------- CHAT annotations -------------------------
        # Unintelligible speech, shortenings and special utterance
        # terminators: remove.
        (r'(\[\/+\])|\(\.+\)|xxx|www|\+\.\.\.|\+\.\.\?|\+\/\.|\+\/\?|\+\/\/\.|\+\/\/\?|\+\"\/\.|\+\"|\+<|\+>|<|>', ""),
        # Undefined characters: remove.
        (r'(â€ž)|‡|„', ""),
        # Post-codes such as [+ exc]: remove.
        (r'\[\+ \w+\]', ""),
        # Filler marker: the "&-" prefix is replaced with "and ".
        # NOTE(review): this turns "&-um" into "and um"; confirm that adding
        # the word "and" (rather than just dropping the prefix) is intended.
        (r'&-', "and "),
        # Word shortenings: keep the omitted letters, e.g. "(be)cause".
        (r'\(([a-zA-Z]+)\)', r"\1"),
        # Non-verbal gestures and local events (e.g. &=laughs): remove.
        (r'&=\w+:*\w*', ""),
        # Phonological word fragments (e.g. &+dr): remove.
        (r'&\+\w+', ""),
        # Word combinations: split (oh_god => oh god). Must run after the
        # "&..." rules so their underscore-containing codes are removed whole.
        (r'_', " "),
        # --------------------------- Speech errors --------------------------
        # Neologisms (wrd@n): keep the word, drop the marker.
        (r'(\S+)\@n\b', r"\1"),
        # Morphological errors: remove target and error annotation.
        (r'\[: (\w+)\]\s+\[\* m:([a-zA-Z0-9]+)\]', ""),
        (r'\[\* m:([a-zA-Z0-9]+):[a-z]+\]', ""),
        (r'\[\* m\]', ""),
        # Phonological errors: mark the target with "REP" so that the
        # preceding error word can be dropped further down.
        (r'\[: (\w+)\] \[\* p:[nw]\]', r"REP \1"),
        # Phonological neologism with a known target (quack@u [: crack]):
        # replace with the target.
        (r'\S+\@u\b\s+\[: (\w+)\]', r"\1"),
        # Semantic errors: remove annotation and target.
        (r'\[: (\w+)\]\s+\[\* s:r:\w+:\w+\]', ""),
        (r'\[: (\w+)\]\s+\[\* s:r:\w+\]', ""),
        (r'\[: (\w+)\]\s+\[\* s:\w+\]', ""),
        # Other errors: mark the target with "REP".
        (r'\[: (\w+)\]\s+\[\* d:sw\]', r"REP \1"),
        (r'\[: (\w+)\]\s+\[\* n:[a-z]+\]', r"REP \1"),
        # Any dangling error annotations: remove.
        (r'\[\* \w+:\w+\]', ""),
        # Drop the error word together with the "REP" marker; the target
        # word that followed the marker stays.
        (r'((\S+\@u\b)|\w+)\s+REP', ""),
        # Remaining "@x" word markers (letter sequences etc.): keep the word.
        (r'(\S+)\@[a-z]\b', r"\1"),
        # Dialectal variations (da [: the]): replace with the target.
        (r'\w+\s+\[: (\w+)\]', r"\1"),
        # ------------------------ Final plain-text form ---------------------
        # Remove the speaker prefix.
        (r'\*PAR:\s+', ""),
        # Remove the space before utterance terminators.
        (r' (\.|\?|\!)', r"\1"),
        # Remove the terminator of lines that contain nothing else. (?m) makes
        # "^" match at every line start, not only at the start of the text.
        (r'(?m)^(\.|\?|\!)', ""),
        # Combine lines and collapse runs of whitespace.
        # NOTE(review): newlines are deleted, not replaced by a space, so an
        # utterance is glued to the next one unless the line carries trailing
        # whitespace; confirm this is intended.
        (r'\n', ""),
        (r'\s+', " "),
    )
)


def _extract_participant_turns(transcript):
    """Return the concatenated *PAR turns of the Window task, or None.

    None is returned when *transcript* has no "@G: Window" section. Only the
    first such section is used.
    """
    section = _TASK_SECTION.search(transcript)
    if section is None:
        return None
    task_text = _NAK_SPAN.sub("", section.group(1))
    return "".join(_PARTICIPANT_TURN.findall(task_text))


def _strip_annotations(text):
    """Apply every rule of the substitution table to *text*, in order."""
    for pattern, replacement in _SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text


def clean_transcript(transcript):
    """Return the participant's Window-task speech as one plain paragraph.

    Args:
        transcript: Full text of one transcript file.

    Returns:
        The cleaned text, or None when the transcript contains no
        "@G: Window" section. A section without any "*PAR:" turn yields "".
    """
    turns = _extract_participant_turns(transcript)
    if turns is None:
        return None
    return _strip_annotations(turns)


def clean_directory(path):
    """Clean every transcript directly inside *path*, in place.

    WARNING: this is destructive. Each regular file is overwritten with its
    cleaned text; files without a Window task are deleted, and so are
    non-regular entries other than directories. Running it a second time on
    already-cleaned files deletes them, because the cleaned text no longer
    contains the task header.

    Args:
        path: Directory containing the transcript files.
    """
    # os.listdir returns a list, so removing entries while looping is safe.
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)

        if not os.path.isfile(entry_path):
            # os.remove cannot delete a directory and would abort the run
            # half-way, so real directories are left alone.
            if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                continue
            os.remove(entry_path)
            continue

        with open(entry_path, "r", encoding="utf-8") as source:
            cleaned = clean_transcript(source.read())

        if cleaned is None:
            os.remove(entry_path)
            continue

        with open(entry_path, "w", encoding="utf-8") as target:
            target.write(cleaned)


if __name__ == "__main__":
    clean_directory(directory_path)