/**
 * Builds a bigram count matrix from the tokens of a file.
 *
 * <p>The file is read as characters using {@code charset} and tokenized with
 * {@code tokenizerFactory}. Each token for which {@code Strings.allLetters}
 * returns true is mapped to a symbol identifier through
 * {@code symbolTable.getOrAddSymbol}; every other token is marked with
 * {@code -1} and never contributes to a count. For each pair of adjacent
 * tokens that both have a symbol identifier, the cell
 * {@code [leftId][rightId]} is incremented by one.
 *
 * <p>The matrix dimension is {@code symbolTable.numSymbols()} after
 * tokenization, so it covers whatever symbols the table holds at that point.
 * Progress statistics are printed to standard output.
 *
 * @param file file whose contents are tokenized
 * @param symbolTable table used to assign identifiers; new symbols are added to it
 * @param tokenizerFactory factory producing the tokenizer for the file contents
 * @param charset name of the character encoding used to read the file
 * @return square matrix of adjacent-token counts indexed by symbol identifier
 * @throws Exception if reading the file fails
 */
public static double[][] extractBigrams(
        File file,
        MapSymbolTable symbolTable,
        TokenizerFactory tokenizerFactory,
        String charset) throws Exception {
    char[] cs = Files.readCharsFromFile(file, charset);
    String[] tokens = tokenizerFactory.tokenizer(cs, 0, cs.length).tokenize();
    System.out.println(" Number of tokens=" + tokens.length);

    // -1 marks tokens that are skipped when counting bigrams.
    int[] symbols = new int[tokens.length];
    for (int i = 0; i < tokens.length; ++i) {
        symbols[i] = Strings.allLetters(tokens[i].toCharArray())
            ? symbolTable.getOrAddSymbol(tokens[i])
            : -1;
    }

    int numSymbols = symbolTable.numSymbols();
    System.out.println(" Number of distinct tokens=" + numSymbols);
    // Multiply as long: numSymbols * numSymbols overflows int above 46340 symbols.
    System.out.println(" #Matrix entries=" + ((long) numSymbols * numSymbols));

    // Freshly allocated double arrays are already zero-filled by the JVM.
    double[][] values = new double[numSymbols][numSymbols];
    for (int i = 1; i < symbols.length; ++i) {
        int left = symbols[i - 1];
        int right = symbols[i];
        if (left >= 0 && right >= 0) {
            values[left][right] += 1.0;
        }
    }
    return values;
}
/**
 * Prints a report on a singular value decomposition of a bigram count matrix.
 *
 * <p>For every singular value, the value is printed along with the extreme
 * entries of the matching left and right singular vectors (via
 * {@link #extremeValues}). Then every non-zero cell of {@code values} is
 * collected as a {@code "left,right"} pair, and up to 25 pairs are printed in
 * the order returned by {@code keysOrderedByValueList()}, first with their
 * original counts and then next to the value reconstructed by
 * {@code matrix.value(leftId, rightId)}.
 *
 * <p>NOTE(review): pairs are re-split on {@code ","} to recover the two
 * tokens, so this assumes symbols do not themselves contain a comma.
 *
 * @param values original count matrix indexed by symbol identifier
 * @param matrix decomposition whose factors and reconstructed values are reported
 * @param symbolTable table used to convert between symbols and identifiers
 */
public static void reportSvd(double[][] values, SvdMatrix matrix, MapSymbolTable symbolTable) {
    double[] singularValues = matrix.singularValues();
    double[][] leftSingularVectors = matrix.leftSingularVectors();
    double[][] rightSingularVectors = matrix.rightSingularVectors();
    for (int order = 0; order < singularValues.length; ++order) {
        System.out.println("\n\nORDER=" + order + " singular value=" + singularValues[order]);
        System.out.println("Extreme Left Values");
        extremeValues(leftSingularVectors, order, symbolTable);
        System.out.println("\nExtreme Right Values");
        extremeValues(rightSingularVectors, order, symbolTable);
    }

    ObjectToDoubleMap<String> topPairCounts = new ObjectToDoubleMap<String>();
    int numSymbols = symbolTable.numSymbols();
    for (int i = 0; i < numSymbols; ++i) {
        for (int j = 0; j < numSymbols; ++j) {
            if (values[i][j] != 0) {
                topPairCounts.set(
                    symbolTable.idToSymbol(i) + "," + symbolTable.idToSymbol(j),
                    values[i][j]);
            }
        }
    }

    int numPairs = topPairCounts.size();
    System.out.println("#unique pairs=" + numPairs);
    List<String> pairsByCount = topPairCounts.keysOrderedByValueList();

    // Never read past the end of the list when fewer than 25 distinct pairs exist.
    final int maxReported = 25;
    int numReported = Math.min(maxReported, pairsByCount.size());

    for (int i = 0; i < numReported; ++i) {
        String pair = pairsByCount.get(i);
        System.out.println(" " + pair + " count=" + topPairCounts.getValue(pair));
    }

    System.out.println("\nRECONSTRUCTED TOP COUNTS");
    System.out.println("LeftToken,RightToken OriginalValue SvdValue");
    for (int i = 0; i < numReported; ++i) {
        String pair = pairsByCount.get(i);
        String[] tokenPair = pair.split(",");
        String leftToken = tokenPair[0];
        String rightToken = tokenPair[1];
        int leftSymbol = symbolTable.symbolToID(leftToken);
        int rightSymbol = symbolTable.symbolToID(rightToken);
        double originalValue = topPairCounts.getValue(pair);
        double reconstructedValue = matrix.value(leftSymbol, rightSymbol);
        System.out.println(pair + " " + originalValue + " " + reconstructedValue);
    }
}
/**
 * Prints the entries at both ends of one column of a vector matrix, labelled
 * by token.
 *
 * <p>Each row index {@code i} of {@code values} is converted to a token with
 * {@code symbolTable.idToSymbol(i)} and associated with
 * {@code values[i][order]}. The tokens are then listed in the order returned
 * by {@code keysOrderedByValueList()}: up to the first ten, a {@code "..."}
 * separator, and up to the last ten. When fewer than twenty tokens exist the
 * two groups overlap and some tokens are printed twice.
 *
 * @param values matrix whose rows are indexed by symbol identifier
 * @param order column of {@code values} to report
 * @param symbolTable table used to convert row indices to tokens
 */
public static void extremeValues(double[][] values, int order, MapSymbolTable symbolTable) {
    ObjectToDoubleMap<String> topVals = new ObjectToDoubleMap<String>();
    for (int i = 0; i < values.length; ++i) {
        String token = symbolTable.idToSymbol(i);
        topVals.set(token, values[i][order]);
    }

    List<String> tokensByValue = topVals.keysOrderedByValueList();
    int size = tokensByValue.size();
    // Clamp both groups to the list size; the tail previously indexed below
    // zero whenever fewer than ten tokens were available.
    int groupSize = Math.min(10, size);

    for (int i = 0; i < groupSize; ++i) {
        String token = tokensByValue.get(i);
        double value = topVals.getValue(token);
        System.out.printf(" %6d %-15s % 5.3f\n", i, token, value);
    }

    System.out.println("...");

    for (int index = size - groupSize; index < size; ++index) {
        String token = tokensByValue.get(index);
        double value = topVals.getValue(token);
        System.out.printf(" %6d %-15s % 5.3f\n", index, token, value);
    }
}