import json
import config
import tiktoken
import re
import openpyxl
from utils import get_table_input
from utils import parse_excel_row
from utils import col_num_to_letter
from utils import get_table_fmt_input



class ValueAddress:
    """Rewrite the user prompt of every JSONL record into "(content|address)" form.

    Each input line is a JSON object with a ``messages`` list and an ``areas1``
    list.  The second message's content is replaced by a new prompt made of a
    fixed instruction text, a short description of the spreadsheet extent and
    the ``areas1`` entries rendered as ``(label|address)`` tuples.
    """

    # Labels substituted for integer data-type codes found in ``areas1``.
    # NOTE(review): code 4 is spelled "SentificNum" while the instruction text
    # below talks about 'ScientificNum'.  Kept unchanged because it is emitted
    # data that downstream consumers may rely on -- confirm before renaming.
    _AREA_TYPE_LABELS = {
        0: "YearData",
        1: "IntNum",
        2: "FloatNum",
        3: "PercentageNum",
        4: "SentificNum",
        5: "DateData",
        6: "TimeData",
        7: "CurrencyData",
        8: "EmailData",
    }

    # Column letters followed by the row number, e.g. "AB12".
    _CELL_ADDRESS_RE = re.compile(r"([A-Z]+)(\d+)")

    # Token thresholds used by len_change().
    # NOTE(review): the 250-token margin is presumably head-room reserved for
    # the model's answer -- confirm.
    _SHORT_CONTEXT_TOKENS = 4096
    _LONG_CONTEXT_TOKENS = 32768
    _RESERVED_TOKENS = 250

    # Instruction text used when config.FMT3_TAG is set (prompt also carries a format dict).
    _INSTRUCTIONS_WITH_FORMAT = r"Instruction: Given an input that is a string denoting data of cells in a Excel spreadsheet. The input spreadsheet contains many tuples, describing the cells with content in the spreadsheet. Each tuple consists of two elements separated by a '|': the cell content and the cell address/region, like (Year|A1), ( |A1) or (IntNum|A1:B3). The content in some cells such as '#,##0'/'d-mmm-yy'/'H:mm:ss',etc., represents the CELL DATA FORMATS of Excel. The content in some cells such as 'IntNum'/'DateData'/'EmailData',etc., represents a category of data with the same format and similar semantics. For example, 'IntNum' represents integer type data, and 'ScientificNum' represents scientific notation type data. 'A1:B3' represents a region in spreadsheet, from the first row to the third row and from column A to column B. Some cells with empty content in the spreadsheet are not entered.  In addition, a dictionary will be provided to record the format information. The key of the dictionary is the format feature, and the value is the cell area with the feature: format_dict: {'Top Border': ['A13:D13', 'A14:D14'], 'Bottom Border': ['A12:D12', 'A13:D13', 'A26:D26'], 'Left Border': [], 'Right Border': ['A13:A26', 'B13:B26', 'C13:C26', 'D13:D26'], 'Fill Color': [], 'Font Bold': ['A1:F1']}. Cells with the same format may have similar semantic information and structural connections, which may help you to understand the table. Now you should tell me the range of the table in a format like A2:D5, and the range of table should only CONTAIN HEADER REGION and the data region, DON'T include the title or comments. Note that there can be more than one table in a string, so you should return all the RANGE, LIKE [{'range': 'A1:F9'}, {'range': 'A12:F18'}]. DON'T ADD OTHER WORDS OR EXPLANATION."

    # Alternative instruction texts kept for reference (disabled prompt variants).
    #1+3/3
    #new_instructions = r"Instruction: Given an input that is a string denoting data of cells in a Excel spreadsheet. The input spreadsheet contains many tuples, describing the cells with content in the spreadsheet. Each tuple consists of two elements separated by a '|': the cell address/region and the cell content , like (A1|Year), (A1| ) or (A1:B3|IntNum). The content in some cells such as '#,##0'/'d-mmm-yy'/'H:mm:ss',etc., represents the CELL DATA FORMATS of Excel. The content in some cells such as 'IntNum'/'DateData'/'EmailData',etc., represents a category of data with the same format and similar semantics. For example, 'IntNum' represents integer type data, and 'ScientificNum' represents scientific notation type data. 'A1:B3' represents a region in spreadsheet, from the first row to the third row and from column A to column B. Some cells with empty content in the spreadsheet are not entered. Now you should tell me the range of the table in a format like A2:D5, and the range of table should only CONTAIN HEADER REGION and the data region, DON'T include the title or comments. Note that there can be more than one table in a string, so you should return all the RANGE, LIKE [{'range': 'A1:F9'}, {'range': 'A12:F18'}]. DON'T ADD OTHER WORDS OR EXPLANATION."

    #2/1+2
    #new_instructions = r"Instruction: Given an input that is a string denoting data of cells in a Excel spreadsheet. The input spreadsheet contains many tuples, describing the cells with content in the spreadsheet. Each tuple consists of two elements separated by a '|': the cell address/region and the cell content , like (Year|A1), ( |A1) or (20|A1:B3). 'A1:B3' represents a region in spreadsheet, from the first row to the third row and from column A to column B. Some cells with empty content in the spreadsheet are not entered. Now you should tell me the range of the table in a format like A2:D5, and the range of table should only CONTAIN HEADER REGION and the data region, DON'T include the title or comments. Note that there can be more than one table in a string, so you should return all the RANGE, LIKE [{'range': 'A1:F9'}, {'range': 'A12:F18'}]. DON'T ADD OTHER WORDS OR EXPLANATION."

    #1+2+3/2+3
    # Instruction text used when config.FMT3_TAG is not set.
    _INSTRUCTIONS_PLAIN = r"Instruction: Given an input that is a string denoting data of cells in a Excel spreadsheet. The input spreadsheet contains many tuples, describing the cells with content in the spreadsheet. Each tuple consists of two elements separated by a '|': the cell content and the cell address/region, like (Year|A1), ( |A1) or (IntNum|A1:B3). The content in some cells such as '#,##0'/'d-mmm-yy'/'H:mm:ss',etc., represents the CELL DATA FORMATS of Excel. The content in some cells such as 'IntNum'/'DateData'/'EmailData',etc., represents a category of data with the same format and similar semantics. For example, 'IntNum' represents integer type data, and 'ScientificNum' represents scientific notation type data. 'A1:B3' represents a region in spreadsheet, from the first row to the third row and from column A to column B. Some cells with empty content in the spreadsheet are not entered. Now you should tell me the range of the table in a format like A2:D5, and the range of table should only CONTAIN HEADER REGION and the data region, DON'T include the title or comments. Note that there can be more than one table in a string, so you should return all the RANGE, LIKE [{'range': 'A1:F9'}, {'range': 'A12:F18'}]. DON'T ADD OTHER WORDS OR EXPLANATION."

    def __init__(self, input_file_path, output_file_path) -> None:
        """Remember the JSONL file to read and the JSONL file to write."""
        self.input_file_path = input_file_path
        self.output_file_path = output_file_path

    @staticmethod
    def get_row_name(row_num, row_l) -> str:
        """Return the 1-based row numbers of the table as a bracketed list.

        Args:
            row_num: Number of rows to list.
            row_l: 0-based index of the first row.

        Returns:
            A string such as ``"[3,4,5]"``; ``"[]"`` when ``row_num`` is 0.
        """
        numbers = (str(index + 1) for index in range(row_l, row_l + row_num))
        return "[" + ",".join(numbers) + "]"

    @staticmethod
    def get_col_name(col_num, col_l) -> str:
        """Return the column letters of the table as a bracketed list.

        Args:
            col_num: Number of columns to list.
            col_l: 0-based index of the first column.

        Returns:
            The results of ``col_num_to_letter`` for each 1-based column
            index, comma separated inside brackets; ``"[]"`` when ``col_num``
            is 0.
        """
        letters = (col_num_to_letter(index + 1) for index in range(col_l, col_l + col_num))
        return "[" + ",".join(letters) + "]"

    def len_change(self, data):
        """Classify a record by the token length of its first two messages.

        Only ``data["messages"][0]`` and ``data["messages"][1]`` are counted
        (with the ``cl100k_base`` encoding); any further message is ignored.

        Returns:
            ``"<4k"``, ``"4-32k"`` or ``">32k"``.
        """
        enc = tiktoken.get_encoding("cl100k_base")
        messages = data["messages"]
        length = sum(len(enc.encode(messages[i]["content"])) for i in (0, 1))

        if length <= self._SHORT_CONTEXT_TOKENS - self._RESERVED_TOKENS:
            return "<4k"
        if length <= self._LONG_CONTEXT_TOKENS - self._RESERVED_TOKENS:
            return "4-32k"
        return ">32k"

    def _table_origin(self, first_address):
        """Return the 0-based (row, column) offsets encoded in a cell address like "B3"."""
        match = self._CELL_ADDRESS_RE.match(first_address)
        if match is None:
            raise ValueError(
                "cannot parse the first cell address of the table: %r" % (first_address,)
            )
        column_letters, row_digits = match.groups()
        row_l = int(row_digits) - 1
        col_l = openpyxl.utils.column_index_from_string(column_letters) - 1
        return row_l, col_l

    def _label_areas(self, areas):
        """Turn ``[address, data_type]`` entries into ``(label, address)`` tuples.

        Integer data types listed in ``_AREA_TYPE_LABELS`` are replaced by
        their label; anything else is passed through untouched.  A region
        whose two corners are equal (``"A1:A1"``), or an address without a
        colon (``"A1"``), is reported as the single cell.
        """
        labelled = []
        for area in areas:
            address, data_type = area[0], area[1]
            if isinstance(data_type, int):
                data_type = self._AREA_TYPE_LABELS.get(data_type, data_type)

            corners = address.split(":")
            begin = corners[0]
            # A bare cell address has no second corner; treat it as begin == end.
            end = corners[1] if len(corners) > 1 else begin
            labelled.append((data_type, begin if begin == end else address))
        return labelled

    def _build_prompt(self, description, new_input, fmt):
        """Assemble the final prompt; ``fmt`` is appended only when config.FMT3_TAG is set."""
        if config.FMT3_TAG:
            return (
                self._INSTRUCTIONS_WITH_FORMAT
                + "\nDescription: " + description
                + "\nInput: " + new_input
                + "\nFormat_Dict: " + fmt
            )
        return (
            self._INSTRUCTIONS_PLAIN
            + "\nDescription: " + description
            + "\nInput: " + new_input
        )

    def process(self):
        """Convert every record of the input file and write the output file.

        For each non-blank input line the second message's content is
        replaced by the rebuilt prompt, the ``areas`` / ``areas1`` keys are
        removed and a ``now_length`` bucket (see ``len_change``) is added.
        The output file is only written after all lines were converted.

        Raises:
            ValueError: If the first cell of a table does not start with a
                cell address such as ``"A1"``.
        """
        with open(self.input_file_path, 'r', encoding='utf-8') as input_file:
            lines = input_file.readlines()

        new_lines = []
        for line in lines:
            # Blank lines (e.g. a trailing empty line) carry no record.
            if not line.strip():
                continue

            data = json.loads(line)
            prompt = data["messages"][1]["content"]
            areas = data["areas1"]
            rows = get_table_input(prompt)
            fmt = get_table_fmt_input(prompt) if config.FMT3_TAG else None

            table = [parse_excel_row(row) for row in rows]
            # The first cell is expected to start with its address, followed
            # by a comma and the cell content.
            first_address = table[0][0].split(",", 1)[0]
            row_l, col_l = self._table_origin(first_address)
            row_num = len(table)
            col_num = len(table[0])

            # Tuple order is (content|address).
            new_input = ",".join(
                "(" + label + "|" + address + ")"
                for label, address in self._label_areas(areas)
            )

            description = (
                "The spreadsheet has " + str(row_num) + " rows and " + str(col_num) + " columns. "
                + "Column names:" + self.get_col_name(col_num, col_l) + "; "
                + "Row numbers:" + self.get_row_name(row_num, row_l)
            )

            data["messages"][1]["content"] = self._build_prompt(description, new_input, fmt)

            # "areas" is never read here, so its absence must not abort the run.
            data.pop("areas", None)
            data.pop("areas1", None)

            data["now_length"] = self.len_change(data)
            new_lines.append(json.dumps(data, ensure_ascii=False) + '\n')

        with open(self.output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.writelines(new_lines)


if __name__ == '__main__':
    # Script entry point: convert the file named by config.VA_INPUT_FILE_PATH
    # and write the result to config.VA_OUTPUT_FILE_PATH.
    valueaddress = ValueAddress(
        input_file_path=config.VA_INPUT_FILE_PATH,
        output_file_path=config.VA_OUTPUT_FILE_PATH,
    )
    valueaddress.process()

