(ns pdf-extractor.core
  (:refer-clojure :exclude [name parents])

  (:require [kinsky.client :as client]
            [clojure.tools.logging :as log]
            [clojure.tools.cli :refer [parse-opts]]
            [clojure.string :as str]
            [clojure.java.io :as io]
            [digest :as md5]
            [cheshire.core :as json]
;;            [gresladix :as gspec]
            )
  (:import
    (org.apache.pdfbox.pdmodel PDDocument)
    (java.io File)
    [java.io File  StringWriter]
    (java.util Date)
    (java.time.format DateTimeFormatter)
    (org.apache.pdfbox.text PDFTextStripper)
    )
  (:gen-class))


(defn extract-doc-metadata
  "Builds a map of document-level metadata for the PDDocument `pdfDoc`.

  The encryption flag, the page count and the PDF version (as a string) are
  always present. Every field read from the document information object
  (author, creation date, creator, keywords, modification date, producer,
  subject, title, trapped) is added only when its value is non-nil; the two
  date fields are stored as the result of calling getTime on them."
  [pdfDoc]
  (let [doc-info (.getDocumentInformation pdfDoc) ;; guaranteed not null
        author   (.getAuthor doc-info)
        created  (.getCreationDate doc-info)
        creator  (.getCreator doc-info)
        modified (.getModificationDate doc-info)
        keywords (.getKeywords doc-info)
        producer (.getProducer doc-info)
        subject  (.getSubject doc-info)
        title    (.getTitle doc-info)
        trapped  (.getTrapped doc-info)]
    ;; Start from the always-present entries, then add each optional field
    ;; in a fixed order, skipping the ones the document does not define.
    (cond-> {:gresladix/file-is-encrypted_b    (.isEncrypted pdfDoc)
             :gresladix/file-number-of-pages_i (.getNumberOfPages pdfDoc)
             :gresladix/file-pdf-version       (str (.getVersion pdfDoc))}
      (some? author)   (assoc :gresladix/file-author_t author)
      (some? created)  (assoc :gresladix/file-creation_dt (.getTime created))
      (some? creator)  (assoc :gresladix/file-creator_t creator)
      (some? keywords) (assoc :gresladix/file-keywords_t keywords)
      (some? modified) (assoc :gresladix/file-lastModified (.getTime modified))
      (some? producer) (assoc :gresladix/file-Producer_t producer)
      (some? subject)  (assoc :gresladix/file-Subject_t subject)
      (some? title)    (assoc :gresladix/file-Title_t title)
      (some? trapped)  (assoc :gresladix/file-Trapped_t trapped))))

(defn extract-pages
  "Extracts the text of `pdfDoc` and splits it into per-page maps.

  Arguments:
    pdfDoc   - an open PDDocument to read text from
    doc-hash - hash of the whole document, used to derive each page hash
    name     - document name, used to build each page title

  Returns a lazy sequence of maps, one per text chunk, with the keys
  :gresladix/text, :gresladix/page_i (1-based chunk number),
  :gresladix/title_t (`name` followed by \"_p\" and the chunk number) and
  :gresladix/hash (md5 of `doc-hash` followed by \"?page=\" and the 0-based
  chunk index). The text itself is extracted eagerly, before this function
  returns; only the construction of the maps is lazy.

  All helpers are local bindings: nothing is defined at namespace level, so
  concurrent or repeated calls cannot interfere with each other."
  [pdfDoc doc-hash name]
  (let [writer   (StringWriter.)
        pageSep  "\n====Page Start====\n"
        stripper (doto (PDFTextStripper.)
                   (.setPageStart pageSep))
        ;; Run the extraction now, while the caller still holds the document open.
        _        (.writeText stripper pdfDoc writer)
        ;; Join words hyphenated across a CRLF line break, then normalise CRLF to LF.
        text     (-> (str (.getBuffer writer))
                     (str/replace "-\r\n" "")
                     (str/replace "\r\n" "\n"))
        ;; NOTE(review): the marker is written at the start of each page, so the
        ;; chunk before the first marker presumably carries no page text and
        ;; shifts page numbers by one. Left unchanged because page hashes depend
        ;; on the chunk index - confirm with consumers before altering.
        chunks   (str/split text #"====Page Start====")]
    (map-indexed
      (fn [i page-text]
        (let [page-no (inc i)]
          {:gresladix/text    page-text
           :gresladix/page_i  page-no
           :gresladix/title_t (str name "_p" page-no)
           ;; the page hash is the hash of the doc-hash plus "?page=<0-based index>"
           :gresladix/hash    (md5/md5 (str doc-hash "?page=" i))}))
      chunks)))

(defn pdf-path-to-pdf-doc-map
  "Loads the PDF at `path` and returns a single map describing it.

  The map holds the file hash, the per-page text maps, the file length,
  absolute path, file name, URL (as a string) and last-modified Date, merged
  with the document metadata produced by `extract-doc-metadata`. The document
  is closed again before this function returns."
  [path]
  (let [source    (File. path)
        file-name (.getName source)
        file-hash (md5/md5 source)]
    (with-open [document (PDDocument/load source)]
      (log/info "Start Extracting data from " file-name)
      (merge
        ;; file-level facts
        {:gresladix/hash         file-hash
         :gresladix/page-texts   (extract-pages document file-hash file-name)
         :gresladix/length       (.length source)
         :gresladix/path         (.getAbsolutePath source)
         :gresladix/name         file-name
         :gresladix/url          (str (io/as-url source))
         :gresladix/lastModified (Date. (.lastModified source))}
        ;; document-level metadata
        (extract-doc-metadata document)))))

(def cli-options
  "Option specifications handed to `parse-opts`.

  -c / --config FILE : path to a JSON configuration file. The file is read
  and parsed while the options are being parsed, with every JSON key turned
  into a keyword, so the resulting :config option is the parsed data rather
  than the path."
  [["-c" "--config FILE" "Configuration json file"
    :parse-fn (fn [path]
                (json/parse-string (slurp path) keyword))
    ;; Disabled existence check, kept for reference.
    ;; NOTE(review): presumably it would receive the parsed value, not the
    ;; path, if re-enabled as written - confirm before restoring.
    ;:validate [#(.exists (io/file %)) "Configuration file must exist"]
    ]])

(defn error-msg
  "Formats a collection of command-line parsing errors as one message:
  a fixed header, a blank line, then one error per line."
  [errors]
  (->> errors
       (clojure.string/join \newline)
       (str "The following errors occurred while parsing your command:\n\n")))

(defn usage
  "Returns the multi-line usage text for the program, with
  `options-summary` placed under the \"Options:\" heading."
  [options-summary]
  (let [lines ["pdf-extractor"
               ""
               "Usage: pdf-extractor [options]"
               ""
               "Options:"
               options-summary
               ""
               ;;"Actions:"
               ;;"  start    Start a new server"
               ;;"  stop     Stop an existing server"
               ;;"  status   Print a server's status"
               ;;""
               "Please refer to the manual page for more information."]]
    (clojure.string/join "\n" lines)))

(defn validate-args
  "Parses the command-line `args` against `cli-options` and decides what
  the program should do.

  Returns a map that contains either
    :exit-message - the formatted parsing errors, or the usage text when no
                    --config option was supplied; or
    :action \"start\" together with the parsed :options."
  [args]
  (let [parsed (parse-opts args cli-options)
        opts   (:options parsed)]
    (if-let [problems (:errors parsed)]
      {:exit-message (error-msg problems)}
      (if (:config opts)
        {:action "start" :options opts}
        {:exit-message (usage (:summary parsed))}))))

(defn exit
  "Prints `msg` to standard output and then terminates the JVM with the
  exit code `status`. Does not return."
  [status msg]
  (do
    (println msg)
    (System/exit status)))

(defn- write-to-out-dir
  "Writes `doc-map` as JSON into the debug output directory.

  Arguments:
    config   - extractor configuration map; :debug-out-dir names the target
               directory
    pdf-path - path of the source PDF; only its file name is used
    doc-map  - the extracted document map to serialise

  The output file is <debug-out-dir>/<pdf file name>.json; missing parent
  directories are created first. Returns nil.

  The date-formatting helpers that used to be defined here with nested
  `defn`s were removed: they were re-created as namespace-level vars on
  every call and were only referenced by a commented-out serialisation
  call."
  [config pdf-path doc-map]
  (let [debug-out-dir (get config :debug-out-dir)
        fname         (.getName (File. pdf-path))
        opath         (str debug-out-dir "/" fname ".json")]
    (log/info "writing debug of " pdf-path " out to " (io/as-file opath))
    (io/make-parents opath)
    (spit opath (json/generate-string doc-map))))

(defn start-extraction
  "Executes the main extraction part.

  `config` is the parsed configuration; the extractor settings are read from
  its :net.expertsystem.lab/pdf-extractor entry:
    :pdf-dir        - directory that is searched recursively for *.pdf files
    :debug-out-dir  - when set, each extracted document is also written there
                      as JSON
    :push-output    - when truthy, results are meant to be pushed to Kafka
    :kafka-server   - Kafka bootstrap servers (only used when pushing)
    :raw-doc-ktopic - Kafka topic for the extracted documents

  The Kafka producer is created only when pushing is enabled and is always
  closed once processing finishes or fails, so a debug-only run needs no
  Kafka settings and no producer is left open. Returns nil."
  [config]
  (log/info "starting-extraction")
  (log/info config)
  (let [_config       (get config :net.expertsystem.lab/pdf-extractor)
        pdf-dir       (get _config :pdf-dir)
        ;; Test the file name as a string rather than handing the File object
        ;; itself to str/ends-with?.
        pdf-file?     (fn [^File f]
                        (and (.isFile f)
                             (str/ends-with? (.getName f) ".pdf")))
        paths         (map str (filter pdf-file? (file-seq (io/file pdf-dir))))
        debug-out-dir (get _config :debug-out-dir)
        push?         (boolean (get _config :push-output false))
        ktopic        (get _config :raw-doc-ktopic)
        ;; Only open a producer when results are actually going to be pushed.
        p             (when push?
                        (client/producer {:bootstrap.servers (get _config :kafka-server)}
                                         (client/keyword-serializer)
                                         (client/json-serializer)))]

    (log/info "iterating through " (count paths) " PDF files in " pdf-dir " and pushing results to " ktopic)

    (try
      (doseq [[i path] (map-indexed vector paths)]
        (log/info (str i ": " path))
        (let [doc-map (pdf-path-to-pdf-doc-map path)]
          (when push?
            (log/trace "pushing to " ktopic)
            ;;(client/send! p ktopic :account-a doc-map)
            ;;(client/flush! p)
            )
          (when debug-out-dir
            (write-to-out-dir _config path doc-map))))
      (finally
        ;; Release the producer's resources even if a document fails.
        (when p
          (client/close! p))))))


(defn -main
  "Program entry point. Validates the command-line `args`; when validation
  yields an exit message it is printed and the process exits (status 0 if
  the result is flagged :ok?, otherwise 1). Otherwise the requested action
  is run: \"start\" launches the extraction with the parsed configuration."
  [& args]
  (let [result  (validate-args args)
        message (:exit-message result)]
    (if message
      (exit (if (:ok? result) 0 1) message)
      (case (:action result)
        "start" (start-extraction (:config (:options result)))))))
