(*--------------------------------------------------------------------------- Copyright (c) 2015 Daniel C. Bünzli. All rights reserved. Distributed under the BSD3 license, see license at the end of the file. %%NAME%% release %%VERSION%% ---------------------------------------------------------------------------*) (* {e Very} naïve implementation of minimal acyclic finite state automaton and its use as an indexing stucture. Haunted by automata ? > ocamlbuild index.native > ./index.native - > fsa.gv ... Type words separated by newlines > dot -Tpdf fsa.gv > fsa.pdf *) module Label = struct type t = int (* byte, really *) let compare : t -> t -> int = Pervasives.compare end module State : sig type t val empty : t val accepting : t -> bool -> t val add_arc : t -> Label.t -> t -> t val accepts : t -> bool val find : t -> Label.t -> t option val compare : t -> t -> int val fold_arcs : (Label.t -> t -> 'a -> 'a) -> t -> 'a -> 'a val tag : t -> int val retag : t -> int -> t end = struct module Lmap = Map.Make (Label) type t = { accepts : bool; arcs : arcs; tag : int } and arcs = t Lmap.t let empty = { accepts = false; arcs = Lmap.empty; tag = 0 } let accepting s accepts = { s with accepts } let add_arc s l t = { s with arcs = Lmap.add l t s.arcs } let accepts s = s.accepts let find s l = try Some (Lmap.find l s.arcs) with Not_found -> None let compare s0 s1 = (* Assumes an acyclic graph of states *) let c = Pervasives.compare s0.accepts s1.accepts in if c <> 0 then c else Lmap.compare compare s0.arcs s1.arcs let fold_arcs f s acc = Lmap.fold f s.arcs acc let tag s = s.tag let retag s tag = { s with tag } end module Fsa : sig type t val accepts : t -> string -> bool val empty : t val add_word : t -> string -> t val minimize : t -> t type index val index : t -> index val index_word_count : index -> int val find_index : index -> string -> int option val find_word : index -> int -> string option val pp_dot : Format.formatter -> index -> unit end = struct type t = State.t let accepts s w = let max = String.length w - 1 in let rec loop k s = if k > max then State.accepts s else match State.find s (Char.code w.[k]) with | None -> false | Some t -> loop (k + 1) t in loop 0 s let empty = State.empty let find_prefix s w = let max = String.length w - 1 in let rec loop k rev_path = if k > max then k, rev_path else let s = List.hd rev_path in match State.find s (Char.code w.[k]) with | None -> k, rev_path | Some t -> loop (k + 1) (t :: rev_path) in loop 0 [s] let add_branch stuck w ~pos = let max = String.length w - 1 in let rec loop k next = let l = Char.code w.[k] in if k = pos then (State.add_arc stuck l next) else loop (k - 1) (State.add_arc State.empty l next) in if pos > max then State.accepting stuck true else loop max (State.accepting State.empty true) let rec rebuild w ~pos rev_path = match rev_path with | [start] -> start | t :: s :: rev_path -> let s = State.add_arc s (Char.code w.[pos]) t in rebuild w ~pos:(pos - 1) (s :: rev_path) | [] -> assert false let add_word s w = let pos, rev_path = find_prefix s w in let stuck = List.hd rev_path in let unstuck = add_branch stuck w ~pos in rebuild w ~pos:(pos - 1) (unstuck :: List.tl rev_path) module Sset = Set.Make (State) let minimize s = (* Not t.r. depth is bounded by max added word length. *) let rec minimize exists s = try exists, Sset.find s exists (* replace *) with | Not_found -> let explore_arc l t (exists, s) = let exists, t = minimize exists t in exists, State.add_arc s l t in let empty = State.(accepting empty (accepts s)) in let init = exists, empty in let exists, s = State.fold_arcs explore_arc s init in Sset.add s exists, s in snd (minimize Sset.empty s) type index = t let index s = let s = minimize s in (* Not t.r. depth is bounded by max added word length. Computes more than needed, since we don't maintain the set of visited nodes. *) let rec retag s = let explore_arc l t (wcount, s) = let t = retag t in (wcount + State.tag t, State.add_arc s l t) in let wcount = if State.accepts s then 1 else 0 in let empty = State.(accepting empty (accepts s)) in let init = wcount, empty in let wcount, s = State.fold_arcs explore_arc s init in State.retag s wcount in retag s let index_word_count s = State.tag s let find_index s w = let max = String.length w - 1 in let rec loop k s i = if k > max then (if State.accepts s then Some i else None) else let l = Char.code w.[k] in match State.find s l with | None -> None | Some t -> let add_skip l' t i = if l' < l then (State.tag t) + i else i in let i = State.fold_arcs add_skip s i in loop (k + 1) t (if State.accepts s then (i + 1) else i) in loop 0 s 0 let find_word s i = let b = Buffer.create 42 in let rec loop s count = let count = if State.accepts s then count - 1 else count in if count = 0 then Some (Buffer.contents b) else let find_arc l t (found, count as acc) = match found with | Some _ -> acc | None -> let wcount = State.tag t in if wcount < count then (found, count - wcount) else (Buffer.add_char b (Char.chr l); Some t, count) in match State.fold_arcs find_arc s (None, count) with | None, _ -> None (* i is out of bounds *) | Some t, count -> loop t count in loop s (i + 1) let identifier () = (* assigns ids to states. *) let module Smap = Map.Make (State) in let id = ref ~-1 in let sids = ref Smap.empty in fun s -> try Smap.find s !sids, false with | Not_found -> incr id; sids := Smap.add s !id !sids; !id, true let pp_dot ppf s = (* Not t.r. depth is bounded by max added word length. *) let pp = Format.fprintf in let pp_label ppf l = if 0x20 <= l && l <= 0x7E then pp ppf "%c" (Char.chr l) else pp ppf "x%X" l in let get_id = identifier () in let rec pp_state s = let id, fresh = get_id s in if not fresh then id else let explore_arc l t () = let tid = pp_state t in pp ppf "@,%d -> %d [label=\"%a\"];" id tid pp_label l in if State.accepts s then pp ppf "@,{ node [shape=doublecircle]; %d}" id; State.fold_arcs explore_arc s (); id in pp ppf "@[digraph fsa@,{ @[rankdir=LR;\ @,node [shape = circle, label=\"\"];"; ignore (pp_state s); pp ppf "@]@,}@]"; () end (* Testing *) let log fmt = Format.eprintf (fmt ^^ "@.") let base_tests () = let a = Fsa.empty in let a = Fsa.add_word a "ab" in let a = Fsa.add_word a "" in let a = Fsa.add_word a "a" in let a = Fsa.add_word a "cdddd" in let a = Fsa.add_word a "b" in let a = Fsa.minimize a in assert (Fsa.accepts a ""); assert (Fsa.accepts a "ab"); assert (Fsa.accepts a "a"); assert (Fsa.accepts a "b"); assert (Fsa.accepts a "cdddd"); assert (not (Fsa.accepts a "c")); let i = Fsa.index a in assert (Fsa.index_word_count i = 5); assert (Fsa.find_index i "" = Some 0); assert (Fsa.find_index i "ab" = Some 2); assert (Fsa.find_index i "a" = Some 1); assert (Fsa.find_index i "b" = Some 3); assert (Fsa.find_index i "cdddd" = Some 4); assert (Fsa.find_index i "c" = None); assert (Fsa.find_word i 0 = Some ""); assert (Fsa.find_word i 1 = Some "a"); assert (Fsa.find_word i 2 = Some "ab"); assert (Fsa.find_word i 3 = Some "b"); assert (Fsa.find_word i 4 = Some "cdddd"); assert (Fsa.find_word i 5 = None); assert (Fsa.find_word i 6 = None); log "Base tests passed."; () let load_words ?(path = "/usr/share/dict/words") () = let module Strings = Set.Make (String) in let err e = Printf.eprintf "%s: %s" path e in let close ic = if path <> "-" then close_in ic in try let ic = if path = "-" then stdin else open_in_bin path in try let rec loop ws = match try Some (input_line ic) with End_of_file -> None with | Some w -> loop (Strings.add w ws) | None -> Array.of_list (Strings.elements ws) in let words = loop Strings.empty in close ic; words with Sys_error e -> if path <> "-" then close ic; err e; [||] with Sys_error e -> err e; [||] let test_words ?path () = let fail fmt = Printf.ksprintf (fun s -> failwith s) fmt in log "Loading words"; let words = load_words ?path () in log "Building finite state automaton"; let fsa = Array.fold_left Fsa.add_word Fsa.empty words in log "Building index (minimizing and tagging automaton)"; let idx = Fsa.index fsa in log "There are %d words in the index" (Fsa.index_word_count idx); let assert_word i w = begin match Fsa.find_index idx w with | None -> fail "no index for %S (%d)" w i | Some i' -> if i <> i' then fail "%S index exp:%d fnd:%d\n" w i i' end; begin match Fsa.find_word idx i with | None -> fail "no word found for %d (exp: %S)" i w | Some w' -> if w <> w' then fail "%d word exp:%S fnd:%S" i w w' end in log "Asserting indexer"; Array.iteri assert_word words; log "Dumping automaton to stdout"; Format.printf "%a" Fsa.pp_dot idx; () let main () = if !Sys.interactive then () else let path = if Array.length Sys.argv > 1 then Some (Sys.argv.(1)) else None in begin (* base_tests (); *) test_words ?path () end let () = main () (*--------------------------------------------------------------------------- Copyright (c) 2015 Daniel C. Bünzli. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of Daniel C. Bünzli nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ---------------------------------------------------------------------------*)