from typing import List

import pandas as pd
from akin import LSH, MinHash


def get_near_duplicates(
    docs: List[str],
    queries: List[str],
    docs_label_prefix: str = "doc",
    queries_label_prefix: str = "query",
    n_gram_type: str = "char",
    n_gram: int = 4,
    permutations: int = 100,
    hash_bits: int = 64,
    no_of_bands: int = 50,
    seed: int = 3,
    min_jaccard: float = 0.55,
) -> pd.DataFrame:
    """Build a boolean near-duplicate adjacency matrix over docs and queries.

    Every text is labelled ``f"{prefix}{index}"`` (e.g. ``doc0``, ``query3``),
    MinHash signatures are computed for both collections with identical
    settings, and all signatures are indexed in a single ``akin.LSH`` model.
    The adjacency list reported by the model is then expanded into a square
    DataFrame.

    Args:
        docs: Reference texts; labelled with ``docs_label_prefix``.
        queries: Texts to compare against ``docs`` (and each other); labelled
            with ``queries_label_prefix``.
        docs_label_prefix: Label prefix for entries of ``docs``.
        queries_label_prefix: Label prefix for entries of ``queries``.
        n_gram_type: Forwarded to ``akin.MinHash`` for docs and queries alike.
        n_gram: Forwarded to ``akin.MinHash`` as ``n_gram``.
        permutations: Forwarded to ``akin.MinHash`` as ``permutations``.
        hash_bits: Forwarded to ``akin.MinHash`` as ``hash_bits``.
        no_of_bands: Forwarded to ``akin.LSH`` as ``no_of_bands``.
        seed: Forwarded to ``akin.MinHash`` so both collections are hashed
            with the same seed.
        min_jaccard: Forwarded to ``LSH.adjacency_list`` as ``min_jaccard``.

    Returns:
        A square DataFrame whose index and columns are the labels returned by
        ``LSH.adjacency_list``. Cell ``[row, col]`` is ``True`` when ``col``
        appears in the neighbour list of ``row``. When both ``docs`` and
        ``queries`` are empty, an empty DataFrame is returned.
    """
    doc_labels = [f"{docs_label_prefix}{i}" for i in range(len(docs))]
    query_labels = [f"{queries_label_prefix}{i}" for i in range(len(queries))]

    # Only non-empty collections are handed to akin; an empty collection has no
    # signatures to index, so it is skipped instead of being passed through.
    batches = [
        (texts, labels)
        for texts, labels in ((docs, doc_labels), (queries, query_labels))
        if labels
    ]
    if not batches:
        return pd.DataFrame(index=[], columns=[])

    # TODO: this LSH implementation is very slow. Consider switching to the
    # Faiss hyperplane-based LSH (faiss.IndexLSH) described here:
    # https://www.pinecone.io/learn/series/faiss/locality-sensitive-hashing-random-projection/
    lsh = None
    for texts, labels in batches:
        # Docs and queries must be shingled and hashed with exactly the same
        # settings (including n_gram_type), otherwise their signatures are not
        # comparable.
        minhash = MinHash(
            texts,
            n_gram_type=n_gram_type,
            n_gram=n_gram,
            permutations=permutations,
            hash_bits=hash_bits,
            seed=seed,
        )
        if lsh is None:
            # The first non-empty collection seeds the model ...
            lsh = LSH(minhash=minhash, labels=labels, no_of_bands=no_of_bands)
        else:
            # ... and the remaining one is added to the same model.
            lsh.update(minhash=minhash, new_labels=labels)

    adjacency_list = lsh.adjacency_list(min_jaccard=min_jaccard)
    columns = list(adjacency_list.keys())
    # One set per row keeps each cell lookup O(1) instead of scanning a list.
    adjacency_data = []
    for neighbours in adjacency_list.values():
        neighbour_set = set(neighbours)
        adjacency_data.append([column in neighbour_set for column in columns])
    return pd.DataFrame(index=columns, columns=columns, data=adjacency_data)
