Skip to content

Instantly share code, notes, and snippets.

@stuartsierra
Created July 9, 2013 01:47
Show Gist options
  • Save stuartsierra/5954022 to your computer and use it in GitHub Desktop.
Save stuartsierra/5954022 to your computer and use it in GitHub Desktop.
Example implementation of Norvig's Spellchecker in Clojure, using core.async
;; Example implementation of Norvig's Spellchecker in Clojure,
;; using core.async
;;
;; There are probably some bugs in this.
;;
;; Original problem: https://github.com/ericnormand/spelling-jam
;; from Lambda Jam, Chicago, 2013: http://lambdajam.com/
;;
;; Clojure core.async introduction:
;; http://clojure.com/blog/2013/06/28/clojure-core-async-channels.html
(ns spellcheck
(:require [clojure.string :as str]
[clojure.core.async :refer :all]))
(defn word-freqs [words]
(let [c (double (count words))]
(reduce-kv (fn [m k v]
(assoc m k (/ (double v) c)))
{}
(frequencies words))))
(def alphabet (map char (range (int \a) (inc (int \z)))))
;; From Alex Miller,
;; https://github.com/puredanger/spelling-corrector/blob/master/src/spelling/core.clj
(defn edits1 [word]
(let [splits (for [i (range (inc (count word)))] [(subs word 0 i) (subs word i)])
deletes (for [[a b] splits :when (not (str/blank? b))] (str a (subs b 1)))
transposes (for [[a b] splits :when (> (count b) 1)] (str a (second b) (first b) (subs b 2)))
replaces (for [[a b] splits :when (> (count b) 0)
c alphabet]
(str a c (subs b 1)))
inserts (for [[a b] splits
c alphabet]
(str a c b))]
(set (concat deletes transposes replaces inserts))))
(def max-working-set-size
"Hard limit to constrain the size of the search space for possible
words. Prevents the lazy sequence from trying forever to find the
next possible correction."
1000)
(defn corrections
"Given a dictionary (map of word frequencies) and a collection of
corrections being considered, returns an infinite lazy sequence of
corrections."
[dict ws]
(when (< (count ws) max-working-set-size)
(lazy-seq
(let [es (mapcat edits1 ws)
ws (->> es
(filter #(contains? dict %))
(sort-by (comp - dict)))]
(concat ws (corrections dict es))))))
(defn read-corpus
"Returns a sequence of words read from a file."
[file]
(re-seq #"[a-z]+" (clojure.string/lower-case (slurp file))))
(def corpus-words
(delay (read-corpus
"big.txt")))
(def corpus-freqs
(delay (word-freqs @corpus-words)))
(defn correct
"Returns a lazy sequence of possible corrections for word."
[word]
(distinct
(concat (when (contains? @corpus-freqs word) [word])
(corrections @corpus-freqs [word]))))
(defn chan-seq
"Returns a channel on which successive elements of the lazy sequence
s will be put."
[s]
(let [out (chan)]
(go (loop [ss s]
(when-first [x ss]
(>! out x)
(recur (rest ss)))))
out))
(defn corr-chan
"Returns a channel on which possible spelling corrections for word
will be written."
[word]
(chan-seq (correct word)))
(defn wait-max
"Returns a channel which receives successive values from channel ch.
If any element takes longer than msec milliseconds to produce,
closes the channel."
[msec ch]
(let [out (chan)]
(go (loop []
(alt! ch ([x] (if x
(do (>! out x) (recur))
(close! out)))
(timeout msec) (close! out))))
out))
(defn take-pr
"Reads and prints (to Java standard output) successive values from
channel ch until it is closed."
[ch]
(go (loop []
(when-let [x (<! ch)]
(binding [*out* (java.io.PrintWriter. System/out)]
(prn x))
(recur)))))
(defn correcting
"Searches for possible corrections to word and prints them to Java
standard output. Takes at most timeout milliseconds to generate a
correction."
[timeout word]
(take-pr (wait-max timeout (corr-chan word))))
(comment
;; Usage examples.
;; If you're using this from a REPL client such as Emacs/nREPL,
;; remember that the output will appear in the STDOUT of the Java
;; process, not your REPL.
(correcting 5000 "colour")
;; "colour"
;; "color"
;; "colours"
;; "colon"
;; "colors"
;; "colony"
(correcting 5000 "milenium")
;; "selenium"
;; "millennium"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment