/**
 * Builds a dense matrix of adjacent-token (bigram) counts for the given file.
 *
 * <p>The file is read with the given charset and tokenized with the given
 * tokenizer factory. Each token for which {@code Strings.allLetters} returns
 * {@code true} is mapped to a symbol identifier via
 * {@code symbolTable.getOrAddSymbol}, so the symbol table may gain new entries
 * as a side effect. Every other token is marked with {@code -1} and never
 * participates in a bigram, although it still separates its neighbours: the
 * tokens on either side of it are not counted as adjacent.
 *
 * <p>The returned matrix is square with side {@code symbolTable.numSymbols()}
 * as measured after the file has been processed; entry {@code [a][b]} is the
 * number of times symbol {@code a} was immediately followed by symbol
 * {@code b}. Progress information is written to {@code System.out}.
 *
 * @param file the file to read
 * @param symbolTable the table used to map tokens to row/column indices;
 *     updated in place
 * @param tokenizerFactory the factory producing the tokenizer for the file's
 *     characters
 * @param charset the name of the character encoding used to read the file
 * @return the bigram count matrix, indexed as {@code [left][right]}
 * @throws Exception if reading or tokenizing the file fails
 */
public static double[][] extractBigrams(
        File file,
        MapSymbolTable symbolTable,
        TokenizerFactory tokenizerFactory,
        String charset) throws Exception {
    char[] cs = Files.readCharsFromFile(file, charset);
    String[] tokens = tokenizerFactory.tokenizer(cs, 0, cs.length).tokenize();
    System.out.println(" Number of tokens=" + tokens.length);

    // Map each token to its symbol id; -1 marks tokens excluded from counting.
    int[] symbols = new int[tokens.length];
    for (int i = 0; i < tokens.length; ++i) {
        symbols[i] = Strings.allLetters(tokens[i].toCharArray())
                ? symbolTable.getOrAddSymbol(tokens[i])
                : -1;
    }

    int numSymbols = symbolTable.numSymbols();
    System.out.println(" Number of distinct tokens=" + numSymbols);
    // Widen before multiplying: an int product overflows past 46340 symbols.
    System.out.println(" #Matrix entries=" + ((long) numSymbols * numSymbols));

    // A freshly allocated double[][] is already zero-filled by the JVM.
    double[][] values = new double[numSymbols][numSymbols];
    for (int i = 1; i < symbols.length; ++i) {
        int left = symbols[i - 1];
        int right = symbols[i];
        // Count the pair only when both neighbours are real symbols.
        if (left >= 0 && right >= 0) {
            values[left][right] += 1.0;
        }
    }
    return values;
}