
(* #load "str.cmxa";; *)
(* #load "scripts/trees.ml";; *)

open Str
open String
open Trees


let thresh = (int_of_string (Sys.argv.(1)));;


(* formatting exception *)
exception WrongFormat of string;;


let ltrees = ref [];;
let hC = Hashtbl.create 1000;;
let hP = Hashtbl.create 1000;;
let hL = Hashtbl.create 1000;;


(* get counts of categories *)
let rec update_counts = function
    End -> ()
  | ChildList(cl1,cl2) ->
      update_counts (cl1);
      update_counts (cl2)
  | Term(s) when string_match (regexp "\\(.*\\)#") s 0 ->
      let p = matched_group 1 s in
      Hashtbl.replace hP p (1 + (if (Hashtbl.mem hP p) then (Hashtbl.find hP p) else 0))
  | Term(s) when string_match (regexp "\\(^.*\\):\\(.*\\){\\(.*\\)}.*$") s 0 ->
      let l = matched_group 1 s in
      let c = matched_group 2 s in
      Hashtbl.replace hL l (1 + (if (Hashtbl.mem hL l) then (Hashtbl.find hL l) else 0));
      Hashtbl.replace hC c (1 + (if (Hashtbl.mem hC c) then (Hashtbl.find hC c) else 0))
  | Term(s) -> raise (WrongFormat s);;


(* Replace rare categories *)
let rec replace_rare_cats = function
    End -> End
  | ChildList(cl1,cl2) ->
      ChildList(replace_rare_cats (cl1),replace_rare_cats (cl2))
  | Term(s) when string_match (regexp "\\(.*\\)#\\(.*\\)") s 0 ->
      let p = matched_group 1 s in
      let w = matched_group 2 s in
      Term((if (Hashtbl.find hP p >= thresh) then p else "Unk")^"#"^w)
  | Term(s) when string_match (regexp "^\\(.*\\):\\(.*\\){\\(.*\\)}\\(.*\\)$") s 0 ->
      let l = matched_group 1 s in
      let c = matched_group 2 s in
      let e = matched_group 3 s in
      let r = matched_group 4 s in
      Term((if (Hashtbl.find hL l >= thresh) then l else "Unk")^":"^(if (Hashtbl.find hC c >= thresh) then c else "Unk")^"{"^e^"}"^r)
  | Term(s) -> raise (WrongFormat s);;


(* read loop *)
try
  while true do
    let s = input_line stdin in     (* "(S (NP the cat) (VP sat down))" in *)
    let r,t = tree_of_string s in
    ltrees := t :: !ltrees ;
    update_counts (t)
    (* print_endline (string_of_tree t) *)
  done;
  None
with
  End_of_file -> None
;;
(* write loop *)
List.iter (function t ->
  print_endline (string_of_tree (replace_rare_cats (t)))
) !ltrees
;;



