gesticulations 001

Tap and drag to input strokes. Use backspace or double-tap to clear the gesture.

The $P multistroke gesture recognizer is a simple prototyping algorithm to recognize gesticulating humans. You can test it on the gray surface above with your mouse or fingers to recognize the symbols on the right. The following is a short explanation and straightforward OCaml implementation of $P.

The idea behind $P is to only consider a gesture's underlying point set. Discarding the information about the user's input timeline and stroke directions makes it easier to recognize a symbol regardless of how the user drew it: with a single stroke, with multiple strokes, from left to right, from right to left, etc.

Given a set of gesture templates T, the task of the recognizer is to find the best element t of T that matches a user input gesture g. Seeing gestures as point sets, this calls for a notion of matching between point sets and a matching metric to evaluate its fitness.

A matching function between point sets g and t is an injective map m : g -> t that associates each point of g with a distinct point of t. Assuming gestures and templates have the same number n of points, which can be achieved by resampling, we define a matching m as a bijective map between g and t.

To evaluate the fitness of m, our matching metric sums the Euclidean distances between the points of g and t matched by m:

Σ ‖g(i) - m(g(i))‖    with i in [0;n-1] and g(i) the ith point of g.

The smaller this matching distance, the better the matching. We define the distance between two gestures g and t as the smallest matching distance over all the n! matchings between g and t.
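
To make these definitions concrete, here is a small standalone sketch that computes this optimal gesture distance by brute force on point lists; the helper names are of my own choosing and are not part of the implementation below. Enumerating the n! matchings is of course only tractable for very small point sets, which is exactly why $P proceeds differently, as described later.

open Gg

(* Brute force illustration only, do not use beyond a handful of points. *)

let rec insert_everywhere x = function  (* [x] inserted at every position *)
| [] -> [ [x] ]
| y :: ys as l ->
    (x :: l) :: List.map (fun rest -> y :: rest) (insert_everywhere x ys)

let rec permutations = function         (* all orderings of a list *)
| [] -> [ [] ]
| x :: xs -> List.concat_map (insert_everywhere x) (permutations xs)

let matching_dist g m =  (* Σ ‖g(i) - m(g(i))‖, [g] and [m] of equal length *)
  List.fold_left2 (fun acc p q -> acc +. V2.(norm @@ p - q)) 0. g m

let optimal_dist g t =   (* smallest matching distance over all matchings *)
  List.fold_left (fun acc m -> min acc (matching_dist g m)) infinity
    (permutations t)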

Now, to match a user input g against a set of gesture templates T, we compute the distance to g for each template in T and take the one yielding the smallest value as the recognized template for g in T; a 1-nearest neighbor classification.

This is the high-level view. Making this simple to implement and computationally practical requires considerations and trade-offs that were investigated by $P's designers and whose conclusions are described next.

We start by defining our API. The signature below should be mostly self-explanatory.

open Gg

module Gest : sig

  type stroke = p2 list

  type t (* Gestures *)
  val of_strokes : stroke list -> t

  type 'a reco (* Recognizers for symbols of type ['a] *)
  val empty_reco : 'a reco
  val add_sym : t -> 'a -> 'a reco -> 'a reco
  val sym_gests : 'a reco -> 'a -> t list
  val recognize : 'a reco -> t -> 'a option * float (* score *)
  
end = struct

  type stroke = p2 list

The $P matching algorithm is easier to write with point set indexing so we represent gestures as arrays of points of fixed length n.

  let n = 32
  let n_float = float n
  type t = p2 array (* of length [n] *)

All gestures have the same number n of points. The resampling function takes a raw gesture, represented by a list of strokes, and tries to resample it to a point set of n mostly equidistant points spread over the strokes. $P's resampling algorithm doesn't work on single point strokes (taps); we massage the data a bit to remove them.

  let stroke_len : stroke -> float =
  fun ps ->
    let rec loop l prev = function
    | [] -> l
    | p :: ps -> loop (l +. V2.(norm @@ p - prev)) p ps
    in
    if ps = [] then 0. else loop 0. (List.hd ps) (List.tl ps)

  let resample : stroke list -> t =
  fun ss ->
    let rec massage len acc = function
    | [] :: ss -> massage len acc ss
    | [_tap] :: ss -> massage len acc ss
    | s :: ss -> massage (len +. stroke_len s) (s :: acc) ss
    | [] -> len, acc
    in
    let len, ss = massage 0. [] ss in
    let ival = len /. (n_float -. 1.) in
    let first = match ss with (p :: _) :: _ -> p | _ -> P2.o in
    let g = Array.make n first in
    let rec loop d last ss i =
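      (* [d] is the distance walked since the last sample or stroke boundary. *)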
      if i >= n then g else
      match ss with
      | [] -> g
      | [] :: (p :: ps) :: ss -> (* start *) loop 0. p (ps::ss) i
      | [] :: ss -> (* end *) loop 0. last ss i
      | (p :: ps) :: ss' ->
          let walk = V2.(p - last) in
          let dist = V2.norm walk in
          let d' = d +. dist in
          match d' < ival with
          | true -> loop d' p (ps :: ss') i
          | false ->
              let t = (ival -. d) /. dist in
              let q = V2.(last + t * walk) in
              g.(i) <- q; loop 0. q ss (i + 1)
    in
    loop 0. first ss 1

$P's recognition is insensitive to scale and position. For this we first map a gesture to the unit square [0;1]×[0;1] with a uniform, shape-preserving scaling transform that maps the largest axis-aligned extent of the point set to unit length. We then translate the centroid of the point set to the origin. We perform these normalizations in place.

  let to_unit_square : t -> unit =
  fun g ->
    let pmin, max_extent =
      let rec loop xmin xmax ymin ymax g i = match i < n with
      | false ->
          (P2.v xmin ymin), max (xmax -. xmin) (ymax -. ymin)
      | true ->
          let px, py = P2.x g.(i), P2.y g.(i) in
          loop (min xmin px) (max xmax px)
               (min ymin py) (max ymax py) g (i + 1)
      in
      loop infinity neg_infinity infinity neg_infinity g 0
    in
    if max_extent = 0. then () (* avoid degen div by zero *) else
    let rescale = 1. /. max_extent in
    let rescale p = V2.(rescale * (p - pmin)) in
    Array.iteri (fun i p -> g.(i) <- rescale p) g

  let centroid : t -> P2.t =
  fun g ->
    let rec loop cx cy g i = match i < n with
    | false -> V2.v (cx /. n_float) (cy /. n_float)
    | true ->
        let p = g.(i) in
        loop (cx +. V2.x p) (cy +. V2.y p) g (i + 1)
    in
    loop 0. 0. g 0

  let center : t -> unit =
  fun g ->
    let c = centroid g in
    Array.iteri (fun i p -> g.(i) <- V2.(p - c)) g

We are now ready to convert our raw gestures to gesture values.

  let of_strokes : stroke list -> t =
  fun ss ->
    let g = resample ss in
    (to_unit_square g; center g; g)

Gesture values being properly normalized, we proceed to compute the distance between two of them. As mentioned above this means finding the matching with the smallest matching distance among the n! possible matching functions.

This is an instance of the combinatorial optimization assignment problem for which the Hungarian Method provides an O(n^3) algorithm. From the point of view of $P's designers this algorithm is too expensive both from a time and implementation complexity perspective. As such $P implements a simpler O(n^2.5) heuristic algorithm that doesn't yield the optimal matching but provides good empirical results.

The algorithm matches points between g and t as follows. Starting with the first point of g it finds the closest point in t and takes it as the matching point. It then proceeds with the next point of g and selects the closest point in t that is not yet matched and so on until we get to the final point of g for which there's no choice but a single remaining point of t to match with.

We do not generate the list of actual matching pairs as we are only interested in the matching distance. Instead we directly compute the distance between matched points and sum it on the fly. During this process the algorithm weights the distance between matched points by a value in [0;1]. The weight expresses a degree of confidence in the point match. The first point of g has a match confidence w(0) of 1 since it is compared to all the points of the template. Subsequent points do not see all the points of the template so the match confidence w(i) decreases according to the following formula:

w(i) = 1 - (i / n)    with i in [0;n-1]

The final result of this computation is sensitive to the starting point in g and the order in which points are considered. To counteract this, the procedure is run with n^ε different starting points of g (with ε in [0;1], 0.5 in practice) and the smallest resulting value is kept; only the starting point varies, the relative order in which the points are visited is not altered.

This computation is also asymmetric, so we measure g against t and t against g and keep again the smallest value.

  let gest_dist_at : start:int -> t -> t -> float =
  fun ~start g t -> (* distance by starting at [start] in [g]. *)
    let matched = Array.make n false in
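    (* [matched.(k)] is [true] once [t.(k)] has been matched to a point of [g]. *)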
    let match_dmin p t = (* unmatched closest to [p] in [t]. *)
      let rec loop dmin_idx dmin k = match k < n with
      | false -> dmin_idx, dmin
      | true ->
          let next = k + 1 in
          if matched.(k) then loop dmin_idx dmin next else
          let d = V2.(norm @@ p - t.(k)) in
          match d < dmin with
          | true -> loop k d next
          | false -> loop dmin_idx dmin next
      in
      let dmin_idx, dmin = loop (-1) infinity 0 in
      (matched.(dmin_idx) <- true; dmin)
    in
    let rec loop sum i =
      let dmin = match_dmin g.(i) t in
      let w = 1. -. (float @@ (i - start + n) mod n) /. n_float in
      let sum = sum +. w *. dmin in
      let next = (i + 1) mod n in
      if next = start then sum else loop sum next
    in
    loop 0. start

  let gest_dist : t -> t -> float =
    let eps = 0.5 in
    let step = truncate (n_float ** (1. -. eps)) in
    fun g t ->
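      (* Try every [step]th point of [g] as a start, measure in both directions. *)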
      let rec loop dmin i = match i < n with
      | false -> dmin
      | true ->
          let d0 = gest_dist_at ~start:i g t in
          let d1 = gest_dist_at ~start:i t g in
          loop (min d0 d1) (i + step)
      in
      loop infinity 0

We fill in the trivial bits of the recognizer type:

  type 'a reco = (t * 'a) list
  let empty_reco = []
  let add_sym t sym r = (t, sym) :: r
  let sym_gests r sym =
    List.(map fst @@ find_all (fun (_, sym') -> sym = sym') r)

The function to recognize the symbol associated with an input gesture simply selects the symbol of the template with the smallest distance. It also returns a score in the range [0;1] with 1 denoting a perfect match.

  let recognize : 'a reco -> t -> 'a option * float =
  fun r g ->
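    (* [score] maps the smallest distance to [0;1], distance 0. being perfect. *)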
    let score dmin = max (0.5 *. (2. -. dmin)) 0. in
    let rec loop fnd dmin = function
    | [] -> fnd, score dmin
    | (t, sym) :: ts ->
        let d = gest_dist g t in
        match d < dmin with
        | true -> loop (Some sym) d ts
        | false -> loop fnd dmin ts
    in
    loop None infinity r
    
end
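
Here is a small, hypothetical, usage sketch of the interface; the symbol type, the template strokes and the input gesture are made up for illustration.

open Gg

type sym = Circle | Cross

let circle : Gest.stroke list =  (* a crude single-stroke circle *)
  let pi = 4. *. atan 1. in
  let pt i = let a = 2. *. pi *. float i /. 8. in P2.v (cos a) (sin a) in
  [ List.init 9 pt ]

let cross : Gest.stroke list =   (* two strokes *)
  [ [ P2.v 0. 0.; P2.v 1. 1. ]; [ P2.v 0. 1.; P2.v 1. 0. ] ]

let reco : sym Gest.reco =
  Gest.empty_reco
  |> Gest.add_sym (Gest.of_strokes circle) Circle
  |> Gest.add_sym (Gest.of_strokes cross) Cross

let () =
  let input = [ [ P2.v 0.1 0.0; P2.v 1.0 0.9 ]; [ P2.v 0.0 1.0; P2.v 0.9 0.1 ] ] in
  match Gest.recognize reco (Gest.of_strokes input) with
  | Some Circle, score -> Printf.printf "circle (score %g)\n" score
  | Some Cross, score -> Printf.printf "cross (score %g)\n" score
  | None, _ -> print_endline "no match"

Note that add_sym can be invoked more than once with the same symbol: recognize simply picks the nearest among all registered templates, so a symbol can be backed by several templates.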

These are the only software gesticulations we needed. The $P gesture recognizer is described and analyzed in this paper; empirical usage guidelines such as number of symbols and adequate number of templates per symbol are also provided. Related information and works can be found on the algorithm's homepage.

Full source (gist) with self-contained webpage of the above interface.