import csv
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer instance.
# NOTE(review): no active code in the visible part of this file calls it -
# confirm it is used elsewhere before removing.
lemmatizer = WordNetLemmatizer()
class Metaphor84:
    """Metaphor dataset loaded from a CSV file, indexed as target -> source.

    After construction ``self.dataset`` has the shape::

        {target: {source: {'attr': {name: [number, ...]}, 'sr': int, 'tr': int}}}

    The filtering helpers return the lighter mapping
    ``{target: {source: [name, ...]}}``, which is the shape ``toTriple`` and
    ``length`` are written for.
    """

    def __init__(self, file):
        """Parse the CSV at *file* into ``self.dataset``.

        Each record consists of consecutive rows:

        1. a header row whose first cell is ``"target-source"``;
        2. attribute rows: attribute name in the second cell, numeric values
           from the third cell onwards (the original author's note labels
           them Freq, Sal, Conn); an empty value cell is stored as ``0``;
        3. a row whose second cell is empty or missing, which ends the
           attribute list;
        4. two rows whose third cell holds the integers stored under
           ``'sr'`` and ``'tr'`` respectively.

        Rows consisting only of blank cells between records are skipped.

        Raises:
            ValueError: if a header is not exactly ``target-source``, a
                target/source pair occurs twice, a numeric cell cannot be
                parsed, or the file ends in the middle of a record.
            IndexError: if an ``sr``/``tr`` row has fewer than three cells.
        """
        dataset = {}
        # The csv module asks for newline='' so that the reader itself, not
        # the file object, interprets line endings inside quoted fields.
        with open(file, newline='') as f:
            reader = csv.reader(f)
            for header in reader:
                if not any(cell.strip() for cell in header):
                    # Blank separator or trailing row, not a record header.
                    continue

                # Names are only stripped and lower-cased (no lemmatization).
                names = [part.strip().lower() for part in header[0].split('-')]
                if len(names) != 2:
                    raise ValueError(
                        "expected 'target-source' in the first column, "
                        "got {!r}".format(header[0])
                    )
                target, source = names

                sources = dataset.setdefault(target, {})
                # Explicit raise instead of assert: the check must survive
                # running Python with -O.
                if source in sources:
                    raise ValueError(
                        'duplicate target/source pair: {}-{}'.format(target, source)
                    )
                label = '{}-{}'.format(target, source)

                attrs = {}
                while True:
                    row = self._next_row(reader, label)
                    if len(row) < 2 or not row[1]:
                        # Attribute list is finished.
                        break
                    attrs[row[1].strip().lower()] = [
                        0 if cell == '' else float(cell) for cell in row[2:]
                    ]

                sr = int(self._next_row(reader, label)[2])
                tr = int(self._next_row(reader, label)[2])
                sources[source] = {'attr': attrs, 'sr': sr, 'tr': tr}

        self.dataset = dataset

    @staticmethod
    def _next_row(reader, label):
        """Return the next CSV row, or raise ValueError if the file ended."""
        try:
            return next(reader)
        except StopIteration:
            # A bare StopIteration leaking out of a constructor is confusing
            # and can silently terminate an enclosing iteration.
            raise ValueError(
                'unexpected end of file inside record {!r}'.format(label)
            ) from None

    def getDatasetByFilter(self, filter):
        """Select attributes whose value list satisfies *filter*.

        Args:
            filter: callable that receives the list of numeric values of one
                attribute and returns a truthy value to keep that attribute.
                (The name shadows the builtin but is kept because it is part
                of the method's signature.)

        Returns:
            ``{target: {source: [attribute, ...]}}``; pairs without any
            accepted attribute are left out.
        """
        res = {}
        for target, sources in self.dataset.items():
            for source, record in sources.items():
                kept = [
                    attr
                    for attr, values in record['attr'].items()
                    if filter(values)
                ]
                if kept:
                    res.setdefault(target, {})[source] = kept
        return res

    def testDataset(self):
        """Return the attributes whose first value is at least 5."""
        return self.getDatasetByFilter(lambda values: values[0] >= 5)

    def devDataset(self):
        """Return the attributes whose first value is below 5."""
        return self.getDatasetByFilter(lambda values: values[0] < 5)

    def toTriple(self, dataset):
        """Flatten ``{target: {source: [attr, ...]}}`` into a list of
        ``(target, source, attr)`` tuples, preserving iteration order."""
        res = []
        for target, sources in dataset.items():
            for source, attrs in sources.items():
                res.extend((target, source, attr) for attr in attrs)
        return res

    def length(self, dataset):
        """Return ``(targets, target-source pairs, target-source-attribute
        entries)`` counted over a ``{target: {source: [attr, ...]}}`` mapping.
        """
        pair_len = sum(len(sources) for sources in dataset.values())
        attr_len = sum(
            len(attrs)
            for sources in dataset.values()
            for attrs in sources.values()
        )
        return len(dataset), pair_len, attr_len
    
    
from itertools import chain
from nltk.corpus import wordnet as wn
def getSynonyms(words):
    """Collect WordNet lemma names for each entry of *words*.

    For every entry, the lemma names of all synsets returned by
    ``wn.synsets`` are gathered into one set.

    * An entry containing spaces is reduced to its last space-separated
      word (``'time bomb'`` -> ``'bomb'``) before lookup.
    * An entry containing ``/`` (``'open/close'``) is looked up once per
      alternative and the results are merged.

    Returns:
        dict mapping the looked-up entry to its set of lemma names.
        NOTE(review): for a multi-word phrase the key is the reduced last
        word, not the original phrase - confirm callers expect this.
    """
    synonyms = {}
    for phrase in words:
        # Multi-word phrases are represented by their final word only.
        head = phrase.split(' ')[-1]
        names = set()
        # Slash-separated alternatives contribute to the same set.
        for variant in head.split('/'):
            for synset in wn.synsets(variant):
                names.update(synset.lemma_names())
        synonyms[head] = names
    return synonyms
# Sample call executed at import time; its return value is discarded.
# NOTE(review): looks like leftover exploratory code - confirm whether it is
# still wanted.
getSynonyms(['car', 'time bomb'])