Ejemplo n.º 1
0
  /**
   * Prints a human-readable report of an SVD factorization to {@code System.out}.
   *
   * <p>The report has three parts:
   *
   * <ol>
   *   <li>for every singular value, its order and value followed by the output of {@code
   *       extremeValues} for the left and right singular vectors;
   *   <li>the number of non-zero cells in {@code values} and up to 25 of them, in the order
   *       returned by {@code ObjectToDoubleMap.keysOrderedByValueList()};
   *   <li>for the same cells, the original value next to {@code matrix.value(row, column)}.
   * </ol>
   *
   * <p>Cells are labelled {@code "leftSymbol,rightSymbol"}. The row/column indices of each label
   * are remembered when the label is built, so symbols that themselves contain a comma are
   * reported against the correct matrix cell.
   *
   * @param values matrix that was factored; only the leading {@code numSymbols x numSymbols}
   *     cells are read, where {@code numSymbols} comes from {@code symbolTable}
   * @param matrix the factorization to report on
   * @param symbolTable maps row/column indices of {@code values} to printable symbols
   */
  public static void reportSvd(double[][] values, SvdMatrix matrix, MapSymbolTable symbolTable) {
    double[] singularValues = matrix.singularValues();
    double[][] leftSingularVectors = matrix.leftSingularVectors();
    double[][] rightSingularVectors = matrix.rightSingularVectors();
    for (int order = 0; order < singularValues.length; ++order) {
      System.out.println("\n\nORDER=" + order + " singular value=" + singularValues[order]);

      System.out.println("Extreme Left Values");
      extremeValues(leftSingularVectors, order, symbolTable);

      System.out.println("\nExtreme Right Values");
      extremeValues(rightSingularVectors, order, symbolTable);
    }

    ObjectToDoubleMap<String> topPairCounts = new ObjectToDoubleMap<String>();
    // Label -> {row, column}. Kept alongside the counts so the indices never have to be parsed
    // back out of the label (splitting on "," breaks for tokens such as ",").
    // Fully qualified because this block cannot see the file's import list.
    java.util.Map<String, int[]> pairIds = new java.util.HashMap<String, int[]>();
    int numSymbols = symbolTable.numSymbols();
    for (int i = 0; i < numSymbols; ++i) {
      for (int j = 0; j < numSymbols; ++j) {
        if (values[i][j] != 0) {
          String pair = symbolTable.idToSymbol(i) + "," + symbolTable.idToSymbol(j);
          // Both maps are overwritten together, so they stay consistent even if two
          // different cells happen to produce the same label.
          topPairCounts.set(pair, values[i][j]);
          pairIds.put(pair, new int[] {i, j});
        }
      }
    }
    int numPairs = topPairCounts.size();
    System.out.println("#unique pairs=" + numPairs);
    List<String> pairsByCount = topPairCounts.keysOrderedByValueList();
    // Small inputs may have fewer than 25 non-zero cells; never index past the end of the list.
    int numReported = Math.min(25, pairsByCount.size());
    for (int i = 0; i < numReported; ++i) {
      String pair = pairsByCount.get(i);
      System.out.println("     " + pair + " count=" + topPairCounts.getValue(pair));
    }

    System.out.println("\nRECONSTRUCTED TOP COUNTS");
    System.out.println("LeftToken,RightToken OriginalValue SvdValue");
    for (int i = 0; i < numReported; ++i) {
      String pair = pairsByCount.get(i);
      int[] ids = pairIds.get(pair);
      double originalValue = topPairCounts.getValue(pair);
      double reconstructedValue = matrix.value(ids[0], ids[1]);
      System.out.println(pair + "  " + originalValue + "  " + reconstructedValue);
    }
  }
Ejemplo n.º 2
0
  /**
   * Command-line entry point: extracts a bigram value matrix from a text file, factors it with
   * {@code SvdMatrix.svd}, and prints a report via {@code reportSvd}.
   *
   * <p>The training parameters (number of factors, learning rate, epochs, ...) are fixed in the
   * method body and echoed to {@code System.out} before training starts. Reporter output is
   * written to {@code System.out} at {@code LogLevel.DEBUG}.
   *
   * @param args {@code args[0]} is the path of the text file to read; further arguments are
   *     ignored
   * @throws IllegalArgumentException if no file argument is supplied
   * @throws Exception if reading the file or computing the factorization fails
   */
  public static void main(String[] args) throws Exception {
    System.out.println("TokenBigramSVD");

    if (args.length < 1) {
      throw new IllegalArgumentException("Expected one argument: path to the input text file");
    }

    File textFile = new File(args[0]);
    MapSymbolTable symbolTable = new MapSymbolTable();
    TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
    String charset = "ASCII";
    System.out.println("  Extracting Bigrams");
    System.out.println("    File=" + textFile.getCanonicalPath());
    System.out.println("    tokenizerFactory.getClass()=" + tokenizerFactory.getClass());
    System.out.println("    input charset=" + charset);
    double[][] values = extractBigrams(textFile, symbolTable, tokenizerFactory, charset);

    int maxFactors = 3;
    double featureInit = 0.1;
    double initialLearningRate = 0.001;
    int annealingRate = 100;
    double regularization = 0.00;
    double minImprovement = 0.0000;
    int minEpochs = 10;
    int maxEpochs = 200;
    // Wraps System.out, so it is flushed below but deliberately never closed.
    PrintWriter verbosePrintWriter = new PrintWriter(new OutputStreamWriter(System.out, charset));
    Reporter reporter = Reporters.writer(verbosePrintWriter).setLevel(LogLevel.DEBUG);

    System.out.println("  Computing SVD");
    System.out.println("    maxFactors=" + maxFactors);
    System.out.println("    featureInit=" + featureInit);
    System.out.println("    initialLearningRate=" + initialLearningRate);
    System.out.println("    annealingRate=" + annealingRate);
    System.out.println("    regularization=" + regularization);
    System.out.println("    minImprovement=" + minImprovement);
    System.out.println("    minEpochs=" + minEpochs);
    System.out.println("    maxEpochs=" + maxEpochs);
    System.out.println("    output charset=" + charset);
    SvdMatrix matrix =
        SvdMatrix.svd(
            values,
            maxFactors,
            featureInit,
            initialLearningRate,
            annealingRate,
            regularization,
            reporter,
            minImprovement,
            minEpochs,
            maxEpochs);

    // Push any buffered training output out before the report so the two do not interleave
    // and nothing is lost when the JVM exits.
    verbosePrintWriter.flush();

    reportSvd(values, matrix, symbolTable);
  }