import json
import re
from qa_utils import get_qa_region, find_ori_table, cal_token
from openai import AzureOpenAI


# Azure OpenAI client used by phase1_infer for the table-selection request.
# NOTE(review): azure_endpoint and api_key are empty strings in this file;
# presumably they are filled in locally before running — confirm, and prefer
# loading them from configuration/environment rather than committing secrets.
client_phase1 = AzureOpenAI(
    azure_endpoint ="",
    api_key = "", 
    api_version = "2024-02-15-preview"
)


def get_new_qa_prompt(new_rows, question):
    """Build the question-answering prompt for a (cropped) table.

    Args:
        new_rows: Iterable of strings, one per table row. Each row is
            appended to the prompt followed by a newline.
        question: The natural-language question about the table.

    Returns:
        The full prompt string: the fixed QA instruction, the question, the
        newline-joined rows, and the trailing ``"\\n\\n##\\n\\n"`` separator.
    """
    QA_instruction = r"Instruction: Given an input that is a string denoting data of cells in a table and a question about this table. The input table includes many pairs, and each pair consists of a cell address and the text in that cell with a ',' in between, like 'A1,Year'. Cells are separated by '|' like 'A1,Year|A2,Profit'. The text can be empty so the cell data is like 'A1, |A2,Profit'. The cells are organized in row-major order. The answer to the input question is contained in the input table and can be represented by cell address. I need you to find the cell address of the answer in the given table based on the given problem description, and return the answer like '{[B3]}' or '{[SUM(A2:A10)]}'. DON'T ADD ANY OTHER WORDS."

    # Every row (including the last one) is terminated by a newline; join
    # builds the text in one pass instead of repeated string concatenation.
    new_input = "".join(new_row + "\n" for new_row in new_rows)

    return (
        QA_instruction
        + "\nQUESTION: " + question
        + "\nInput: " + new_input
        + "\n\n##\n\n"
    )

def phase1_infer(prompt, question):
    """Ask the phase-1 model which table of the spreadsheet answers *question*.

    The spreadsheet description is taken from *prompt* (the text following
    the ``"\\nDescription: "`` marker), wrapped in the table instruction and
    the table-selection request, and sent through ``client_phase1``.

    Args:
        prompt: An existing prompt string containing ``"\\nDescription: "``
            followed by the encoded spreadsheet.
        question: The question whose answer table should be located.

    Returns:
        The assistant message text exactly as produced by the model (the
        prompt asks for something like ``[{'range': 'A1:F9'}]``). An empty
        string is returned when the response carries no text content.

    Raises:
        IndexError: If *prompt* does not contain the ``"\\nDescription: "``
            marker, or the response has no choices.
    """
    old_input = prompt.split("\nDescription: ")[1]

    table_instruction = r"Instruction: Given an input that is a string denoting data of cells in a table. The input table contains many tuples, describing the cells with content in the spreadsheet.  Each tuple consists of two elements separated by a '|': the cell content and the cell address/region, like (Year|A1), ( |A1) or (IntNum|A1:B3). The content in some cells such as '#,##0'/'d-mmm-yy'/'H:mm:ss',etc., represents the CELL DATA FORMATS of Excel. The content in some cells such as 'IntNum'/'DateData'/'EmailData',etc., represents a category of data with the same format and similar semantics. For example, 'IntNum' represents integer type data, and 'ScientificNum' represents scientific notation type data. 'A1:B3' represents a region in spreadsheet, from the first row to the third row and from column A to column B. Some cells with empty content in the spreadsheet are not entered."
    new_instruction = "\nHow many tables are there in the spreadsheet? Below is a question about one certain table in this spreadsheet. I need you to determine in which table the answer to the following question can be found, and return the RANGE of the ONE table you choose, LIKE [{'range': 'A1:F9'}]."

    new_prompt = (
        table_instruction
        + "\nDescription: " + old_input
        + new_instruction
        + "\nQUESTION: " + question
    )
    message_text = [
        {"role": "system", "content": "You are an AI assistant that helps people find information."},
        {"role": "user", "content": new_prompt},
    ]
    completion = client_phase1.chat.completions.create(
        model="va_nfs_fmt0_4k-ft-gpt4-v4",
        messages=message_text,
        temperature=0,
        max_tokens=300,
        top_p=0.99,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

    # Read the message text from the response object itself. Slicing the
    # repr() of the completion depends on the SDK's field order and yields
    # the Python-quoted/escaped form of the text rather than the text.
    content = completion.choices[0].message.content
    return content if content is not None else ""

def phase1(file_name, question, data_path="QAcode/qadata_method2+3.jsonl"):
    """Locate the table region that should contain the answer to *question*.

    Scans the JSONL file at *data_path* for the first record whose
    ``"file_name"`` equals *file_name*, sends that record's user prompt
    (``messages[1]["content"]``) to ``phase1_infer`` and extracts the
    range from the model's reply.

    Args:
        file_name: Spreadsheet identifier to look up in the JSONL file.
        question: The question passed on to ``phase1_infer``.
        data_path: Path of the JSONL file with the encoded spreadsheets.
            Defaults to the file used by this script.

    Returns:
        A one-element list ``[range_string]`` on success, or ``-1`` when
        the record is missing, its ``"now_length"`` is not ``"<4k"``
        (presumably the encoded sheet is too long for the model — confirm),
        or the reply does not contain exactly two single-quoted tokens.
    """
    with open(data_path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) in the JSONL.
                continue
            data = json.loads(line)
            if data["file_name"] != file_name:
                continue

            if data["now_length"] != "<4k":
                return -1

            prompt = data["messages"][1]["content"]
            ans = phase1_infer(prompt, question)
            print("ans:", ans)

            # The reply is expected to look like [{'range': 'A1:F9'}]: the
            # first quoted token is the key, the second is the range itself.
            matches = re.findall(r"'(.*?)'", ans)
            if len(matches) != 2:
                print(file_name)
                return -1
            return [matches[1]]

    print("没有找到和问题匹配的原始数据")
    return -1


def coordinate_mapping(region, file_name, mapping_path="QAcode/qadata_coordinate_mapping.jsonl"):
    """Translate a region's corner addresses through a per-file mapping.

    The JSONL file at *mapping_path* is scanned for records whose
    ``"file_name"`` equals *file_name*; the record's ``"reflection"`` dict
    maps each address to its replacement. If several records match, the
    last one in the file is used.

    Args:
        region: Sequence whose first element is a range string such as
            ``"A1:B8"``.
        file_name: Spreadsheet identifier to look up in the mapping file.
        mapping_path: Path of the JSONL mapping file. Defaults to the file
            used by this script.

    Returns:
        The mapped range as ``"<start>:<end>"``.

    Raises:
        KeyError: If no record for *file_name* exists, or one of the two
            addresses is absent from the mapping.
        IndexError: If ``region[0]`` does not contain a ``":"``.
    """
    position_dic = None
    with open(mapping_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) in the JSONL.
                continue
            data = json.loads(line)
            if data["file_name"] == file_name:
                position_dic = data["reflection"]

    if position_dic is None:
        # Fail with a clear message instead of an unbound-variable error.
        raise KeyError(f"no coordinate mapping found for file {file_name!r}")

    addresses = region[0].split(":")
    return position_dic[addresses[0]] + ":" + position_dic[addresses[1]]



def main():
    """Run phase 1 over every question and write the intermediate QA data.

    Reads ``QAcode/data_eng_all.jsonl`` (one JSON object per line with
    ``file_name``, ``question`` and ``answer``), asks ``phase1`` for the
    table region of each question, and writes one JSON line per question to
    ``QAcode/intermediate_data_method2+3_1.jsonl``:

    * when ``phase1`` fails (returns ``-1``) the full original table rows
      are stored under ``qa_table_rows``;
    * otherwise the rows are cropped to the predicted region with
      ``get_qa_region`` and the region is stored as
      ``region_before_compress``.

    Progress (the number of records processed so far) is printed after
    each record.
    """
    with open("QAcode/data_eng_all.jsonl", "r", encoding="utf-8") as input_file, \
            open("QAcode/intermediate_data_method2+3_1.jsonl", "w", encoding="utf-8") as output_file:
        for count, line in enumerate(input_file, start=1):
            data = json.loads(line)
            file_name = data["file_name"]
            question = data["question"]
            answer = data["answer"]

            region = phase1(file_name, question)
            print("region:", region)
            ori_input_rows = find_ori_table(file_name, "QAcode/ori_data.jsonl")

            if region == -1:
                # No usable region: fall back to the whole original table.
                item1 = {
                    "file_name": file_name,
                    "question": question,
                    "answer": answer,
                    "qa_table_rows": ori_input_rows,
                }
            else:
                # NOTE: the predicted region is used as-is. coordinate_mapping()
                # is available for translating it through the per-file mapping
                # but is not applied here; the expected format is a string
                # such as 'A1:B8'.
                region_before_compress = region[0]
                print("region_before_compress:", region_before_compress)

                qa_table_rows = get_qa_region(ori_input_rows, region_before_compress)
                item1 = {
                    "file_name": file_name,
                    "question": question,
                    "answer": answer,
                    "qa_table_rows": qa_table_rows,
                    "region_before_compress": region_before_compress,
                }

            output_file.write(json.dumps(item1) + "\n")
            print(count)


# Script entry point: run the whole phase-1 pipeline.
# NOTE(review): this call is unguarded, so importing this module also runs
# main(); consider an `if __name__ == "__main__":` guard if the functions
# above ever need to be imported from elsewhere.
main()
