Markov.java

/**
 * Markov.java
 * Copyright (c) 2009 Stimware.com
 */

package com.stimware.markov;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

/**
 * This is a tool that uses Markov Chains to generate random names using patterns.
 * It will load a text file containing names, and build a set of rules
 * that indicate the probability that certain letters will appear in sequence.
 * This can then be used to generate names that will be completely random
 * and yet still follow a set of rules.
 */
public class Markov {

    private Map<String, Chain> chains = new HashMap();
    private Random random = new Random();

    // Only build chains from words this length or greater
    private int inputMinWordLength;
    
    // Length of chains to build
    private int chainLength;
    
    /**
     * Load Markov chains from a text file containing names.
     * @param file The text file to read
     * @param chainLength Length of chains to build
     * @param inputMinWordLength Only build chains from words this length or greater
     * @return The number of distinct chains read
     * @throws IOException if an error occurs reading text from the stream
     */
    public Markov(File file, int chainLength, int inputMinWordLength) throws IOException {
        InputStream in = null;
        try {
            in = new FileInputStream(file);
            load(in, chainLength, inputMinWordLength);
        } finally {
            if (in != null) {
                in.close();
            }
        }
    }

    /**
     * Load Markov chains from an input stream containing names.
     * @param in The input stream pointing to the text to read
     * @param chainLength Length of chains to build
     * @param inputMinWordLength Only build chains from words this length or greater
     * @return The number of distinct chains read
     * @throws IOException if an error occurs reading text from the stream
     */
    public Markov(InputStream in, int chainLength, int inputMinWordLength) throws IOException {
        load(in, chainLength, inputMinWordLength);
    }
    
    private void load(InputStream in, int chainLength, int inputMinWordLength) throws IOException {
        this.chainLength = chainLength;
        this.inputMinWordLength = inputMinWordLength;
        random.setSeed(System.nanoTime());
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            processLine(line);
        }
    }

    /**
     * Get the number of distinct chains read after loading the file.
     * @return the number of distinct chains
     */
    public int getChainCount() {
        return chains.size();
    }
    
    /**
     * We actually process one word at a time, so need to split up incoming
     * lines into words.
     * @param line The line to process
     */
    private void processLine(String line) {
        
        // Convert anything like whitespace into a space
        line = line.replaceAll("\\?|\\.|,|!|;", " ");
        
        // Get all distinct "words"
        String[] words = line.split(" ");
        for (String word : words) {
            word = word.trim();
            if (word.length() >= inputMinWordLength) {
                word = word.substring(0, 1).toUpperCase() + word.substring(1).toLowerCase();
                processWord(word);
            }
        }
    }

    /**
     * Process a single word without any whitespace.
     * @param word The word to process
     */
    private void processWord(String word) {
        
        // Start the chain as the first character, including preceeding space
        word = " " + word + " ";
        
        // Each "chain" is a sequence of characters.
        // We start with a space and the first character, and build it from there
        String chain = word.substring(0, 2);

        // The chain moves along the length of the word one character at a time.
        for (int i = 1; i < word.length() - 1; i++) {
            processChain(chain);
            chain = incrementChain(chain, word.charAt(i + 1), chainLength + 1);
        }
        processChain(chain);
    }

    /**
     * A chain is a sequence of characters. We will make a rule from this,
     * stating that the final character in the sequence has a probability of
     * occurring after the previous characters
     * @param chainString 
     */
    private void processChain(String chainString) {
      
        // From this chain string, the rule set is the first n-1 characters.
        String sequence = chainString.substring(0, chainString.length() - 1);
        Chain chain = getChain(sequence);
        
        // The following character is added to the possible results for the chain.
        char next = chainString.charAt(chainString.length() - 1);
        chain.addNextChar(next);
    }
    
    /**
     * Get a new chain from the string, which is begins just after the previous chain.
     */
    private String incrementChain(String chainString, char next, int length) {
        chainString += next;
        int pos = chainString.length() - length;
        chainString = chainString.substring(pos < 0 ? 0 : pos);
        return chainString;
    }
    
    /**
     * Look up out list of chains already created. It is expected that certain
     * latter pairings (chains) will be encountered more than once.
     */
    private Chain getChain(String start) {
        Chain chain = chains.get(start);
        if (chain == null) {
            chain = new Chain();
            chains.put(start, chain);
        }
        return chain;
    }

    /**
     * Generate a string from our ruleset of chains.
     * @param wordLimit Maximum number of words in output
     * @param maxLengthSoft Length at which name generation will wind down, ie. stop after completion of current word
     * @param maxLengthHard Name generation will stop at this length, even if in mid-word
     * @return a randomly generated string
     */
    public String generate(int wordLimit, int maxLengthSoft, int maxLengthHard) {
        
        // Start from a space. This is important, because not all letters
        // are used to begin a name - only those known to follow a space.
        String chainString = " ";
        
        StringBuffer buffer = new StringBuffer(maxLengthHard);
        int wordCount = 0;
        boolean reachedMaxLengthSoft = false;
        
        // Keep going until we reach the hard limit. We may early out if the rules tell us to.
        for (int i = 0; i < maxLengthHard - 1; i++) {
            Chain chain = chains.get(chainString);
            if (chain == null) {
                // This should never happen, but just in case
                throw new IllegalStateException("No chain found: [" + chainString + "].\nThis means we have never seen this sequence of characters before, so cannot determine what could possibly follow.");
            }
            
            // This is where the magic happens - getting the next character.
            char next = chain.getNextChar();
            
            // The next character might be a space that divides names,
            // so need to check if we have reached the word limit.
            // If the soft limit has already been reached then we finish.
            if (next == ' ') {
                wordCount++;
                chainString = " ";
                if (wordCount >= wordLimit || reachedMaxLengthSoft) {
                    return buffer.toString();
                }
            } else {
                chainString = incrementChain(chainString, next, chainLength);
            }
            buffer.append(next);
            if (i > maxLengthSoft) {
                // Don't finish just yet, we will do so at the end of the next word
                reachedMaxLengthSoft = true;
            }
        }
        
        return buffer.toString();
    }
    
    /**
     * This represents a sequence of characters that can occur. There is no
     * ordering to the list, and certain characters may appear in the list more
     * than once if they have a higher probability of occurring. This is a simple
     * and effective way of assigning a weighting to each possible character.
     */
    private class Chain {
        
        private List<Character> nextList = new ArrayList();

        /**
         * This is where we register that we have seen a character occur in a sequence.
         * For any character this can happen more than once, which adds to its weight.
         * The more times a character has appeared after an initial sequence, the
         * greater probability it will appear when calling getNextChar below.
         * @param nextChar the character that might follow in a sequence
         */
        void addNextChar(char nextChar) {
            nextList.add(Character.valueOf(nextChar));
        }

        /**
         * Get the next character from a list of possible characters.
         * @return the next character
         */
        char getNextChar() {
            int index = random.nextInt(nextList.size());
            return nextList.get(index);
        }
    }
}