001 (ns dj-marky-markov.core
002 "A simple Markov Chain for text generation in Clojure"
003 (:require [clojure.string :as cs])
004 (:gen-class))
005
(def first-capital-letter-regex
  "Pattern matching a single ASCII capital letter (A-Z) at the very start of
  a string. Used to recognise text that could open a sentence."
  #"^[A-Z]")
009
(def punctuation-regex
  "Pattern matching one sentence-terminating character: `.`, `?` or `!`.
  This is deliberately loose: any occurrence counts as a terminator, so
  something like an abbreviation's full stop also matches. Needs to be made
  more robust."
  #"[\.\?\!]")
013
(defn concat-with-space
  "Join `s1` and `s2` into one string, separated by a single space.
  Each argument is stringified first, so `nil` contributes an empty string."
  [s1 s2]
  (cs/join " " [s1 s2]))
018
(defn search-text
  "Return the last `window-length` whitespace-separated words of `stem`,
  joined by single spaces. The result is the key used to look up the next
  markov transition.

  `stem` is trimmed and split on runs of whitespace (the same tokenisation
  used when the sliding windows are built), so repeated or trailing
  whitespace never produces empty words in the look-up. If `stem` has fewer
  than `window-length` words, all of them are returned; a `window-length` of
  zero yields the empty string."
  [stem window-length]
  (->> (cs/split (cs/trim stem) #"\s+")
       (take-last window-length)
       ;; join (rather than a reduce without an initial value) also copes
       ;; with an empty selection.
       (cs/join " ")))
025
(defn string-to-sliding-window
  "Tokenise `string` on runs of whitespace and return a lazy sequence of
  overlapping windows, each `window-length` + 1 words long and advancing one
  word at a time. The extra word is the transition that follows the
  `window-length`-word look-up.

  Empty tokens (produced when `string` starts with whitespace) are dropped so
  they never become part of a window. Input with fewer than
  `window-length` + 1 words yields an empty sequence."
  [string window-length]
  (->> (cs/split string #"\s+")
       ;; splitting a string with leading whitespace yields a leading \"\"
       (remove empty?)
       (partition (inc window-length) 1)))
031
(defn single-window-to-tuple
  "Turn `window` into a two-element sequence: the look-up (every word except
  the last, joined with spaces) followed by the transition (the last word)."
  [window]
  (let [look-up    (reduce concat-with-space (butlast window))
        transition (last window)]
    (list look-up transition)))
036
(defn add-entry
  "Return `dictionary` with the transition from `entry` recorded under its
  look-up. `entry` is a (look-up, transition) pair. The transition is placed
  at the front of the look-up's existing transitions; a look-up seen for the
  first time gets a fresh one-element list."
  [dictionary entry]
  (let [[look-up transition] entry]
    ;; conj onto nil creates a list, and conj onto a list prepends
    (update dictionary look-up conj transition)))
042
(defn build-markov-dictionary
  "Fold every (look-up, transition) tuple in `tuple-set` into a map from
  look-up to the collection of transitions observed after it. Starts from an
  empty map and adds the tuples in order via `add-entry`."
  [tuple-set]
  (loop [remaining  (seq tuple-set)
         dictionary {}]
    (if remaining
      (recur (next remaining)
             (add-entry dictionary (first remaining)))
      dictionary)))
047
(defn starts-sentence?
  "Return true iff the first element of `text-window` begins with a capital
  letter (as defined by `first-capital-letter-regex`), false otherwise."
  [text-window]
  (let [leading (first text-window)]
    (some? (re-find first-capital-letter-regex leading))))
053
(defn sentence-ended?
  "Return true iff `text` contains at least one terminal punctuation
  character (as defined by `punctuation-regex`) anywhere in it."
  [text]
  (if (re-find punctuation-regex text)
    true
    false))
058
(defn markov-sentence
  "Generate one sentence.

  Starts from a look-up chosen uniformly at random among the keys of
  `sentence-starters`, then repeatedly takes the last `window-length` words
  of the sentence so far, looks them up in `sentence-bodies`, and appends a
  randomly chosen transition. Generation stops as soon as the sentence
  contains terminal punctuation; everything after the first terminator is
  discarded. If the current look-up has no known transitions, the sentence is
  closed with a full stop.

  Throws `ex-info` when `sentence-starters` is empty, because there is
  nothing to start a sentence from."
  [sentence-starters sentence-bodies window-length]
  (when-not (seq sentence-starters)
    (throw (ex-info "Cannot generate a sentence: no sentence starters available"
                    {:window-length window-length})))
  (loop [sentence (rand-nth (keys sentence-starters))]
    (if (sentence-ended? sentence)
      ;; We want to trim off anything that trails after punctuation
      ;; e.g. "I went to the store. Hello" -> "I went to the store."
      (str (first (cs/split sentence punctuation-regex))
           (re-find punctuation-regex sentence))
      (let [look-up     (search-text sentence window-length)
            transitions (get sentence-bodies look-up)]
        (if (seq transitions)
          (recur (concat-with-space sentence (rand-nth transitions)))
          ;; Dead end in the model: terminate directly so that no space is
          ;; inserted before the full stop.
          (str sentence "."))))))
073
(defn markov-sentences
  "Return a lazy sequence of `sentences` independently generated sentences,
  each produced by `markov-sentence` with the given `sentence-starters`,
  `sentence-bodies` and `window-length`."
  [sentence-starters sentence-bodies window-length sentences]
  (let [generate #(markov-sentence sentence-starters sentence-bodies window-length)]
    (repeatedly sentences generate)))
078
(defn load-data!
  "Read the whole file at `path` and return a lazy sequence of
  (look-up, transition) tuples, where each look-up spans `window-length`
  words and the transition is the word that follows it."
  [path window-length]
  (let [text    (slurp path)
        windows (string-to-sliding-window text window-length)]
    (map single-window-to-tuple windows)))
083
(defn write-sentences
  "Generate `copies` sentences from `tuples`, where each transition window is
  `window-length` words long.

  Two dictionaries are built: one from only the tuples whose look-up can
  start a sentence (used to pick the opening words) and one from all tuples
  (used for every subsequent transition). Returns the sequence of generated
  sentences; nothing is printed or written here."
  [tuples window-length copies]
  (let [starter-tuples    (filter starts-sentence? tuples)
        sentence-starters (build-markov-dictionary starter-tuples)
        sentence-bodies   (build-markov-dictionary tuples)]
    (markov-sentences sentence-starters sentence-bodies window-length copies)))
090
(defn -main
  "Command-line entry point.

  Expects three arguments: the path of the training text, the window length
  (number of words per look-up) and the number of sentences to generate.
  Prints each generated sentence on its own line.

  When fewer than three arguments are supplied, a usage message is printed to
  standard error and nothing is generated. Non-numeric values for the two
  integer arguments raise NumberFormatException."
  [& args]
  (if (< (count args) 3)
    (binding [*out* *err*]
      (println "Usage: dj-marky-markov <path> <window-length> <sentence-count>"))
    (let [[path window-arg count-arg] args
          ;; Parse both numbers up front so bad input fails before the file
          ;; is read.
          window-length       (Integer/parseInt window-arg)
          results-to-generate (Integer/parseInt count-arg)
          tuples              (load-data! path window-length)
          generated-text      (write-sentences tuples window-length results-to-generate)]
      (doseq [sentence generated-text]
        (println sentence)))))