コード例 #1
0
  /**
   * Tokenizes a text file and counts how often each pair of adjacent all-letter tokens occurs.
   *
   * <p>Every token for which {@code Strings.allLetters} holds is mapped to an identifier via
   * {@code symbolTable.getOrAddSymbol}; all other tokens are marked with {@code -1}. A pair of
   * neighbouring tokens is counted only when both have a non-negative identifier, so a
   * non-letter token between two words prevents those words from forming a bigram.
   *
   * <p>Progress figures (token count, symbol count, matrix size) are written to standard output.
   *
   * @param file the file whose characters are read with {@code Files.readCharsFromFile}
   * @param symbolTable table that supplies (and is extended with) token identifiers; its
   *     {@code numSymbols()} after tokenization fixes the dimension of the result
   * @param tokenizerFactory factory used to create the tokenizer over the file's characters
   * @param charset name of the character set passed through to the file reader
   * @return a square matrix where {@code result[left][right]} is the number of times the symbol
   *     {@code right} appeared immediately after the symbol {@code left}
   * @throws Exception propagated from reading the file (declared by the original signature)
   */
  public static double[][] extractBigrams(
      File file, MapSymbolTable symbolTable, TokenizerFactory tokenizerFactory, String charset)
      throws Exception {

    char[] cs = Files.readCharsFromFile(file, charset);
    String[] tokens = tokenizerFactory.tokenizer(cs, 0, cs.length).tokenize();
    System.out.println("    Number of tokens=" + tokens.length);

    // -1 marks tokens that are not purely alphabetic; they never take part in a bigram.
    int[] symbols = new int[tokens.length];
    for (int i = 0; i < tokens.length; ++i) {
      symbols[i] =
          Strings.allLetters(tokens[i].toCharArray()) ? symbolTable.getOrAddSymbol(tokens[i]) : -1;
    }

    int numSymbols = symbolTable.numSymbols();
    System.out.println("    Number of distinct tokens=" + numSymbols);
    // Widen before multiplying: numSymbols * numSymbols overflows int once numSymbols > 46340.
    System.out.println("    #Matrix entries=" + (long) numSymbols * numSymbols);

    // A freshly allocated double array is already filled with 0.0, so no explicit fill is needed.
    double[][] values = new double[numSymbols][numSymbols];

    for (int i = 1; i < symbols.length; ++i) {
      int left = symbols[i - 1];
      int right = symbols[i];
      if (left >= 0 && right >= 0) {
        values[left][right] += 1.0;
      }
    }

    return values;
  }
コード例 #2
0
  /**
   * Prints a report about a singular value decomposition to standard output.
   *
   * <p>The report has three parts:
   *
   * <ol>
   *   <li>for every singular value, the extreme entries of the corresponding left and right
   *       singular vectors (see {@code extremeValues});
   *   <li>the number of non-zero cells in {@code values} and up to 25 of them, taken from the
   *       front of {@code ObjectToDoubleMap.keysOrderedByValueList()} (presumably the largest
   *       counts first);
   *   <li>for those same pairs, the original count next to the value returned by
   *       {@code matrix.value(left, right)}.
   * </ol>
   *
   * @param values the original count matrix, indexed by symbol identifier on both axes
   * @param matrix the decomposition being reported
   * @param symbolTable table used to convert between symbol identifiers and token strings
   */
  public static void reportSvd(double[][] values, SvdMatrix matrix, MapSymbolTable symbolTable) {
    double[] singularValues = matrix.singularValues();
    double[][] leftSingularVectors = matrix.leftSingularVectors();
    double[][] rightSingularVectors = matrix.rightSingularVectors();
    for (int order = 0; order < singularValues.length; ++order) {
      System.out.println("\n\nORDER=" + order + " singular value=" + singularValues[order]);

      System.out.println("Extreme Left Values");
      extremeValues(leftSingularVectors, order, symbolTable);

      System.out.println("\nExtreme Right Values");
      extremeValues(rightSingularVectors, order, symbolTable);
    }

    // Collect every non-zero cell under the key "leftToken,rightToken".
    ObjectToDoubleMap<String> topPairCounts = new ObjectToDoubleMap<String>();
    int numSymbols = symbolTable.numSymbols();
    for (int i = 0; i < numSymbols; ++i) {
      for (int j = 0; j < numSymbols; ++j) {
        if (values[i][j] != 0) {
          topPairCounts.set(
              symbolTable.idToSymbol(i) + "," + symbolTable.idToSymbol(j), values[i][j]);
        }
      }
    }
    int numPairs = topPairCounts.size();
    System.out.println("#unique pairs=" + numPairs);
    List<String> pairsByCount = topPairCounts.keysOrderedByValueList();

    // Never read past the end of the list: small inputs may yield fewer than 25 distinct pairs.
    int numReported = Math.min(25, numPairs);
    for (int i = 0; i < numReported; ++i) {
      String pair = pairsByCount.get(i);
      System.out.println("     " + pair + " count=" + topPairCounts.getValue(pair));
    }

    System.out.println("\nRECONSTRUCTED TOP COUNTS");
    System.out.println("LeftToken,RightToken OriginalValue SvdValue");
    for (int i = 0; i < numReported; ++i) {
      String pair = pairsByCount.get(i);
      // Recover both tokens from the "left,right" key built above.
      String[] tokenPair = pair.split(",");
      String leftToken = tokenPair[0];
      String rightToken = tokenPair[1];
      int leftSymbol = symbolTable.symbolToID(leftToken);
      int rightSymbol = symbolTable.symbolToID(rightToken);
      double originalValue = topPairCounts.getValue(pair);
      double reconstructedValue = matrix.value(leftSymbol, rightSymbol);
      System.out.println(pair + "  " + originalValue + "  " + reconstructedValue);
    }
  }
コード例 #3
0
 /**
  * Prints the entries at both ends of one column of a matrix, labelled with their tokens.
  *
  * <p>Row {@code i} of {@code values} is associated with the token
  * {@code symbolTable.idToSymbol(i)}. The tokens are ordered by their value in column
  * {@code order} using {@code ObjectToDoubleMap.keysOrderedByValueList()}; up to 10 entries
  * from the front of that ordering are printed, then a {@code "..."} separator, then up to 10
  * entries from the back. When there are fewer than 20 tokens the two lists overlap.
  *
  * @param values matrix whose rows correspond to symbol identifiers
  * @param order index of the column to inspect
  * @param symbolTable table used to turn row indices into token strings
  */
 public static void extremeValues(double[][] values, int order, MapSymbolTable symbolTable) {
   ObjectToDoubleMap<String> topVals = new ObjectToDoubleMap<String>();
   for (int i = 0; i < values.length; ++i) {
     String token = symbolTable.idToSymbol(i);
     topVals.set(token, values[i][order]);
   }
   List<String> tokensByValue = topVals.keysOrderedByValueList();
   int size = tokensByValue.size();
   for (int i = 0; i < 10 && i < size; ++i) {
     String token = tokensByValue.get(i);
     double value = topVals.getValue(token);
     System.out.printf("     %6d %-15s % 5.3f\n", i, token, value);
   }
   System.out.println("...");
   // Clamp the tail length to the list size; an unclamped 10 yields negative indices
   // (and an IndexOutOfBoundsException) whenever fewer than 10 tokens are present.
   int tailCount = Math.min(10, size);
   for (int i = tailCount; --i >= 0; ) {
     int rank = size - i - 1;
     String token = tokensByValue.get(rank);
     double value = topVals.getValue(token);
     System.out.printf("     %6d %-15s % 5.3f\n", rank, token, value);
   }
 }