(*---------------------------------------------------------------------------
   Copyright (c) 2021 The cmarkit programmers. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

let str = Format.sprintf
let exec = Filename.basename Sys.executable_name

let pp_uchar ppf u = Format.fprintf ppf "U+%04X" (Uchar.to_int u)

let fold_uchars f acc =
  let rec loop f acc u =
    let acc = f acc u in
    if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u)
  in
  loop f acc Uchar.min

let sat_list p =
  let add acc u = if p u then u :: acc else acc in
  List.rev (fold_uchars add [])

(* See https://spec.commonmark.org/current/#unicode-whitespace-character *)
let is_whitespace u =
  let is_zs = Uucp.Gc.general_category u = `Zs in
  let u = Uchar.to_int u in
  is_zs || u = 0x0009 || u = 0x000A || u = 0x000C || u = 0x000d

(* See https://spec.commonmark.org/current/#ascii-punctuation-character *)
let is_ascii_punctuation u =
  let u = Uchar.to_int u in
  (0x0021 <= u && u <= 0x002F) ||
  (0x003A <= u && u <= 0x0040) ||
  (0x005B <= u && u <= 0x0060) ||
  (0x007B <= u && u <= 0x007E)

let is_punctuation u = match Uucp.Gc.general_category u with
| `Pc | `Pd | `Pe | `Pf | `Pi | `Po | `Ps -> true
| `Sm | `Sc | `Sk | `So -> true
| _ -> is_ascii_punctuation u

let whitespace_list = sat_list is_whitespace
let punctuation_list = sat_list is_punctuation

let case_fold_map =
  let uchar_map acc u = match Uucp.Case.Fold.fold u with
  | `Self -> acc
  | `Uchars f ->
      let esc u = Printf.sprintf "\\u{%04X}" (Uchar.to_int u) in
      (u, String.concat "" (List.map esc f)) :: acc
  in
  List.rev (fold_uchars uchar_map [])

let byte_size v =
  let words = Obj.reachable_words (Obj.repr v) in
  (words / (Sys.word_size / 8))

let case_fold_count =
  let add acc u = match Uucp.Case.Fold.fold u with
  | `Self -> acc | `Uchars _ -> acc + 1
  in
  fold_uchars add 0

let test () =
  Printf.printf "whitespace: %d characters\n" (List.length whitespace_list);
  Printf.printf "punctuation: %d characters\n" (List.length punctuation_list);
  Printf.printf "non-id case fold: %d characters\n" case_fold_count;
  ()

let year = (Unix.gmtime (Unix.gettimeofday ())).Unix.tm_year + 1900

let gen ppf () =
  let pp_cp ppf u = Format.fprintf ppf "0x%04X" (Uchar.to_int u) in
  let pp_binding ppf (u, s) = Format.fprintf ppf "%a, \"%s\"" pp_cp u s in
  let pp_sep ppf () = Format.fprintf ppf ";@ " in
  let pp_cps ppf us = Format.pp_print_list ~pp_sep pp_cp ppf us in
  let pp_map ppf m = Format.pp_print_list ~pp_sep pp_binding ppf m in
  Format.fprintf ppf
{|(*---------------------------------------------------------------------------
   Copyright (c) %d The cmarkit programmers. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

(* Do not edit. Data generated by support/unicode_data.ml *)

let unicode_version = "%s"

let whitespace =@?
  @[<1>[|%a|]@]

let punctuation =@?
  @[<1>[|%a|]@]

let case_fold =@?
  @[<1>[|%a|]@]
%!|} year Uucp.unicode_version pp_cps whitespace_list pp_cps punctuation_list
     pp_map case_fold_map


let process do_test outf =
  if do_test then test () else
  let generate pp outf =
    try
      let oc = if outf = "-" then stdout else open_out_bin outf in
      try
        let ppf = Format.formatter_of_out_channel oc in
        pp ppf ();
        Format.pp_print_flush ppf ();
        close_out oc
      with Sys_error _ as e -> close_out oc; raise e
    with Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1
  in
  generate gen outf

let main () =
  let usage = str "Usage: %s [OPTION]…\nOptions:" exec in
  let test = ref false in
  let outf = ref None in
  let options =
    [ "-o", Arg.String (fun s -> outf := Some s),
      "<FILE> output file, defaults to src/cmarkit_data_uchar.ml";
      "-t", Arg.Set test,
      "Do not generate, test data";
    ]
  in
  let no_pos s = raise (Arg.Bad (str "Don't know what to do with %S" s)) in
  Arg.parse (Arg.align options) no_pos usage;
  let outf = Option.value ~default:"src/cmarkit_data_uchar.ml" !outf in
  process !test outf;
  0

let () = if !Sys.interactive then () else exit (main ())
