Markov.java
/**
* Markov.java
* Copyright (c) 2009 Stimware.com
*/
package com.stimware.markov;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
/**
* This is a tool that uses Markov Chains to generate random names using patterns.
* It will load a text file containing names, and build a set of rules
* that indicate the probability that certain letters will appear in sequence.
* This can then be used to generate names that will be completely random
* and yet still follow a set of rules.
*/
public class Markov {
private Map<String, Chain> chains = new HashMap();
private Random random = new Random();
// Only build chains from words this length or greater
private int inputMinWordLength;
// Length of chains to build
private int chainLength;
/**
* Load Markov chains from a text file containing names.
* @param file The text file to read
* @param chainLength Length of chains to build
* @param inputMinWordLength Only build chains from words this length or greater
* @return The number of distinct chains read
* @throws IOException if an error occurs reading text from the stream
*/
public Markov(File file, int chainLength, int inputMinWordLength) throws IOException {
InputStream in = null;
try {
in = new FileInputStream(file);
load(in, chainLength, inputMinWordLength);
} finally {
if (in != null) {
in.close();
}
}
}
/**
* Load Markov chains from an input stream containing names.
* @param in The input stream pointing to the text to read
* @param chainLength Length of chains to build
* @param inputMinWordLength Only build chains from words this length or greater
* @return The number of distinct chains read
* @throws IOException if an error occurs reading text from the stream
*/
public Markov(InputStream in, int chainLength, int inputMinWordLength) throws IOException {
load(in, chainLength, inputMinWordLength);
}
private void load(InputStream in, int chainLength, int inputMinWordLength) throws IOException {
this.chainLength = chainLength;
this.inputMinWordLength = inputMinWordLength;
random.setSeed(System.nanoTime());
BufferedReader reader = new BufferedReader(new InputStreamReader(in));
for (String line = reader.readLine(); line != null; line = reader.readLine()) {
processLine(line);
}
}
/**
* Get the number of distinct chains read after loading the file.
* @return the number of distinct chains
*/
public int getChainCount() {
return chains.size();
}
/**
* We actually process one word at a time, so need to split up incoming
* lines into words.
* @param line The line to process
*/
private void processLine(String line) {
// Convert anything like whitespace into a space
line = line.replaceAll("\\?|\\.|,|!|;", " ");
// Get all distinct "words"
String[] words = line.split(" ");
for (String word : words) {
word = word.trim();
if (word.length() >= inputMinWordLength) {
word = word.substring(0, 1).toUpperCase() + word.substring(1).toLowerCase();
processWord(word);
}
}
}
/**
* Process a single word without any whitespace.
* @param word The word to process
*/
private void processWord(String word) {
// Start the chain as the first character, including preceeding space
word = " " + word + " ";
// Each "chain" is a sequence of characters.
// We start with a space and the first character, and build it from there
String chain = word.substring(0, 2);
// The chain moves along the length of the word one character at a time.
for (int i = 1; i < word.length() - 1; i++) {
processChain(chain);
chain = incrementChain(chain, word.charAt(i + 1), chainLength + 1);
}
processChain(chain);
}
/**
* A chain is a sequence of characters. We will make a rule from this,
* stating that the final character in the sequence has a probability of
* occurring after the previous characters
* @param chainString
*/
private void processChain(String chainString) {
// From this chain string, the rule set is the first n-1 characters.
String sequence = chainString.substring(0, chainString.length() - 1);
Chain chain = getChain(sequence);
// The following character is added to the possible results for the chain.
char next = chainString.charAt(chainString.length() - 1);
chain.addNextChar(next);
}
/**
* Get a new chain from the string, which is begins just after the previous chain.
*/
private String incrementChain(String chainString, char next, int length) {
chainString += next;
int pos = chainString.length() - length;
chainString = chainString.substring(pos < 0 ? 0 : pos);
return chainString;
}
/**
* Look up out list of chains already created. It is expected that certain
* latter pairings (chains) will be encountered more than once.
*/
private Chain getChain(String start) {
Chain chain = chains.get(start);
if (chain == null) {
chain = new Chain();
chains.put(start, chain);
}
return chain;
}
/**
* Generate a string from our ruleset of chains.
* @param wordLimit Maximum number of words in output
* @param maxLengthSoft Length at which name generation will wind down, ie. stop after completion of current word
* @param maxLengthHard Name generation will stop at this length, even if in mid-word
* @return a randomly generated string
*/
public String generate(int wordLimit, int maxLengthSoft, int maxLengthHard) {
// Start from a space. This is important, because not all letters
// are used to begin a name - only those known to follow a space.
String chainString = " ";
StringBuffer buffer = new StringBuffer(maxLengthHard);
int wordCount = 0;
boolean reachedMaxLengthSoft = false;
// Keep going until we reach the hard limit. We may early out if the rules tell us to.
for (int i = 0; i < maxLengthHard - 1; i++) {
Chain chain = chains.get(chainString);
if (chain == null) {
// This should never happen, but just in case
throw new IllegalStateException("No chain found: [" + chainString + "].\nThis means we have never seen this sequence of characters before, so cannot determine what could possibly follow.");
}
// This is where the magic happens - getting the next character.
char next = chain.getNextChar();
// The next character might be a space that divides names,
// so need to check if we have reached the word limit.
// If the soft limit has already been reached then we finish.
if (next == ' ') {
wordCount++;
chainString = " ";
if (wordCount >= wordLimit || reachedMaxLengthSoft) {
return buffer.toString();
}
} else {
chainString = incrementChain(chainString, next, chainLength);
}
buffer.append(next);
if (i > maxLengthSoft) {
// Don't finish just yet, we will do so at the end of the next word
reachedMaxLengthSoft = true;
}
}
return buffer.toString();
}
/**
* This represents a sequence of characters that can occur. There is no
* ordering to the list, and certain characters may appear in the list more
* than once if they have a higher probability of occurring. This is a simple
* and effective way of assigning a weighting to each possible character.
*/
private class Chain {
private List<Character> nextList = new ArrayList();
/**
* This is where we register that we have seen a character occur in a sequence.
* For any character this can happen more than once, which adds to its weight.
* The more times a character has appeared after an initial sequence, the
* greater probability it will appear when calling getNextChar below.
* @param nextChar the character that might follow in a sequence
*/
void addNextChar(char nextChar) {
nextList.add(Character.valueOf(nextChar));
}
/**
* Get the next character from a list of possible characters.
* @return the next character
*/
char getNextChar() {
int index = random.nextInt(nextList.size());
return nextList.get(index);
}
}
}