 #
 #     BenchIE: A Framework for Multi-Faceted Fact-Based Open Information Extraction Evaluation
 #
 #        File:  oie_extractions.py
 #
 #     Authors: Deleted for purposes of anonymity
 #
 #     Proprietor: Deleted for purposes of anonymity --- PROPRIETARY INFORMATION
 #
 # The software and its source code contain valuable trade secrets and shall be maintained in
 # confidence and treated as confidential information. The software may only be used for
 # evaluation and/or testing purposes, unless otherwise explicitly stated in the terms of a
 # license agreement or nondisclosure agreement with the proprietor of the software.
 # Any unauthorized publication, transfer to third parties, or duplication of the object or
 # source code---either totally or in part---is strictly prohibited.
 #
 #     Copyright (c) 2021 Proprietor: Deleted for purposes of anonymity
 #     All Rights Reserved.
 #
 # THE PROPRIETOR DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY
 # AND FITNESS FOR A PARTICULAR PURPOSE AND THE WARRANTY AGAINST LATENT
 # DEFECTS, WITH RESPECT TO THE PROGRAM AND ANY ACCOMPANYING DOCUMENTATION.
 #
 # NO LIABILITY FOR CONSEQUENTIAL DAMAGES:
 # IN NO EVENT SHALL THE PROPRIETOR OR ANY OF ITS SUBSIDIARIES BE
 # LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES
 # FOR LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF INFORMATION, OR
 # OTHER PECUNIARY LOSS AND INDIRECT, CONSEQUENTIAL, INCIDENTAL,
 # ECONOMIC OR PUNITIVE DAMAGES) ARISING OUT OF THE USE OF OR INABILITY
 # TO USE THIS PROGRAM, EVEN IF the proprietor HAS BEEN ADVISED OF
 # THE POSSIBILITY OF SUCH DAMAGES.
 #
 # For purposes of anonymity, the identity of the proprietor is not given herewith.
 # The identity of the proprietor will be given once the review of the
 # conference submission is completed.
 #
 # THIS HEADER MAY NOT BE EXTRACTED OR MODIFIED IN ANY WAY.
 #

import re
import numpy as np
import pdb

class OIEExtractions():
    """
        Container for the triples produced by one OIE system, with helpers to load them from a file,
        compute simple statistics over them and select the triples of particular sentences.

        Attributes
        ----------
            oie_system: str
                The name of the OIE system
            extractions: list
                List of all extractions by the OIE system. It is a list of lists, where the elements are:
                [sent id, subj, rel, obj]
            stats: dict
                A dictionary containing statistics about the extractions.
                Key: statistic name; value: the statistic value.
                Currently only 'avg_token_count' is computed (see compute_stats).
    """

    def __init__(self) -> None:
        """
            Default constructor for the OIEExtractions class. Creates an empty container: no system name,
            no extractions, and 'avg_token_count' initialised to 0.0.
        """
        self.oie_system = ""
        self.extractions = []
        self.stats = {}
        self.stats['avg_token_count'] = 0.0

    def load_oie_extractions(self, filename: str) -> None:
        """
            Load OIE extractions done by OIE systems. Any previously loaded extractions are replaced.

            Each line is stripped of surrounding whitespace and split on tabs. Blank lines are skipped, so
            they do not end up as one-element rows that would break compute_stats.
            Note that, because the whole line is stripped, an empty last field (a trailing tab) is dropped.

            Args
            ----
                filename: str
                    filename where the extractions are stored (read as UTF-8). The expected format about how
                    the triples are written is:
                            sent_id \t subj \t rel \t obj
        """
        with open(filename, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            # A literal tab needs no regular expression; str.split is enough.
            self.extractions = [line.split('\t') for line in stripped_lines if line]

    def set_oie_system_name(self, oie_name: str) -> None:
        """
            Set the name of the OIE system which extracted the triples

            Args
            ----
                oie_name: str
                    the name of the OIE system
        """
        self.oie_system = oie_name

    def compute_stats(self) -> None:
        """
            Compute the stats for the extractions and store them in self.stats.

            'avg_token_count' is the mean, over all extractions, of the number of space-separated tokens in
            subj + rel + obj. If no extractions are loaded, it is set to 0.0 (instead of the NaN that
            averaging an empty array would produce).

            Raises
            ------
                IndexError
                    If an extraction has fewer than 4 fields.
        """
        if not self.extractions:
            self.stats['avg_token_count'] = 0.0
            return

        token_counts = [
            # Fields 1..3 are subj, rel, obj; field 0 (sentence id) is not counted.
            len(ex[1].split(" ")) + len(ex[2].split(" ")) + len(ex[3].split(" "))
            for ex in self.extractions
        ]
        self.stats['avg_token_count'] = np.mean(token_counts)

    def get_subset_extractions(self, sent_ids: list) -> "OIEExtractions":
        """
            Return a subset of the OIE extractions based on the provided list of sentence IDs.

            The result is ordered by the order of sent_ids and, within one sentence, by the original order
            of the extractions. A sentence ID that appears several times in sent_ids contributes its
            extractions several times. IDs are compared with the first field of each extraction, so they
            must have the same type as the stored IDs (strings when loaded with load_oie_extractions).
            The extraction rows are shared with this object, not copied.

            Args
            ----
                sent_ids: list
                    List of sentence IDs (must be hashable)

            Returns
            -------
                subset: OIEExtractions
                    The subset of the OIE extractions. It carries the same OIE system name (also when
                    sent_ids is empty) and freshly initialised stats ('avg_token_count' = 0.0).
        """
        subset = OIEExtractions()
        subset.oie_system = self.oie_system

        # Group once by sentence id so each requested id is a dictionary lookup rather than a full scan.
        by_sent_id = {}
        for ex in self.extractions:
            by_sent_id.setdefault(ex[0], []).append(ex)

        for s_id in sent_ids:
            subset.extractions.extend(by_sent_id.get(s_id, []))

        return subset
