// Example #1
  /**
   * Builds the list of summarizer batteries to evaluate: one {@code Summerizer[]} per
   * summarization strategy, each holding one summarizer per input document (parallel
   * to {@code docs}).
   *
   * @param docs documents to summarize
   * @param probs per-document n-gram probabilities, indexed parallel to {@code docs}
   * @param inferredProbs inferred LDA topic probabilities shared by all documents
   * @return one array of per-document summarizers for each strategy
   * @throws RuntimeException if the serialized corpus counts cannot be loaded
   */
  private List<Summerizer[]> generateSummarizerList(
      List<Doc> docs, NGramProbs[] probs, LDAProbs inferredProbs) {
    List<Summerizer[]> summarizers = new ArrayList<>();
    Summerizer[] s;

    // Baseline 1: first-sentence style baseline.
    s = new Summerizer[docs.size()];
    for (int i = 0; i < s.length; i++) {
      System.out.println("Generating FirstBaseline #" + (i + 1));
      s[i] = new FirstBaseline(docs.get(i), DEFAULT_MAX_SUMMARY_LENGTH);
    }
    summarizers.add(s);

    // Baseline 2.
    s = new Summerizer[docs.size()];
    for (int i = 0; i < s.length; i++) {
      System.out.println("Generating SecondBaseline #" + (i + 1));
      s[i] = new SecondBaseline(docs.get(i), DEFAULT_MAX_SUMMARY_LENGTH);
    }
    summarizers.add(s);

    // The feature-based summarizers below all need the corpus-wide counts; failing
    // to load them is unrecoverable here, so wrap the checked IOException.
    CorpusCounts counts;
    try {
      counts = SerializableWrapper.readObject(CorpusCounts.SAVE_PATH);
    } catch (IOException e) {
      throw new RuntimeException(
          "Failed to load corpus counts from " + CorpusCounts.SAVE_PATH, e);
    }

    // Feature-based summarizer using tf-idf and LDA features.
    s = new Summerizer[docs.size()];
    for (int i = 0; i < s.length; i++) {
      System.out.println("Generating FeatureBased #" + (i + 1));
      Feature tf_idf = new Tf_IdfFeature(counts, docs.get(i));
      Feature lda = new LDAFeature(inferredProbs, docs.get(i));
      s[i] =
          new FeatureBasedSummary(docs.get(i), DEFAULT_MAX_SUMMARY_LENGTH, probs[i], tf_idf, lda);
    }
    summarizers.add(s);

    // Sentence-level variant of the feature-based summarizer.
    s = new Summerizer[docs.size()];
    for (int i = 0; i < s.length; i++) {
      System.out.println("Generating FeatureBased_Sent #" + (i + 1));
      Feature tf_idf = new Tf_IdfFeature(counts, docs.get(i));
      Feature lda = new LDAFeature(inferredProbs, docs.get(i));
      s[i] =
          new FeatureBasedSummary_Sent(
              docs.get(i), DEFAULT_MAX_SUMMARY_LENGTH, probs[i], tf_idf, lda);
    }
    summarizers.add(s);

    // Bag-of-words variant of the feature-based summarizer.
    s = new Summerizer[docs.size()];
    for (int i = 0; i < s.length; i++) {
      System.out.println("Generating FeatureBased_BagOfWords #" + (i + 1));
      Feature tf_idf = new Tf_IdfFeature(counts, docs.get(i));
      Feature lda = new LDAFeature(inferredProbs, docs.get(i));
      s[i] =
          new FeatureBasedSummary_BagOfWords(
              docs.get(i), DEFAULT_MAX_SUMMARY_LENGTH, probs[i], tf_idf, lda);
    }
    summarizers.add(s);

    return summarizers;
  }
// Example #2
  /**
   * Entry point: loads configuration, documents, topic models, and document clusters,
   * generates a summary of every document with every summarizer strategy, then writes
   * the summaries to disk and scores each strategy with ROUGE.
   *
   * @param args unused
   * @throws IOException if configuration, cluster, or output files cannot be read or
   *     written
   */
  public static void main(String[] args) throws IOException {
    // Load config files
    System.err.println("Loading config files");
    Config conf = ConfigFactory.loadConfiguration(Config.class, Config.DEFAULT);
    IOConfig ioConf = ConfigFactory.loadConfiguration(IOConfig.class, IOConfig.DEFAULT);
    LDAEstimatorConfig estConf =
        ConfigFactory.loadConfiguration(LDAEstimatorConfig.class, LDAEstimatorConfig.DEFAULT);
    LDAInferenceConfig infConf =
        ConfigFactory.loadConfiguration(LDAInferenceConfig.class, LDAInferenceConfig.DEFAULT);

    main m = new main(conf, ioConf);

    // Load files that we want to summarize
    System.err.println("Loading documents");
    m.loadFiles();

    // Load topic models
    System.err.println("Loading topic models");
    LDAProbs inferredModel = LDAProbsLoader.loadLDAProbs(estConf, infConf);

    // Loading clusters
    System.err.println("Loading doc clusters");
    DocCluster trainCluster = SerializableWrapper.readObject(DocCluster.CLUSTER_100_PATH);

    // Assign docs to clusters
    System.err.println("Assigning docs to clusters");
    List<Integer> clusterAssign = m.assignDocClusters(inferredModel);

    // Get a list of ngram probabilities for each document
    System.err.println("Getting doc ngram probabilities");
    NGramProbs[] probs = m.genDocNGramProbs(clusterAssign, trainCluster);

    System.err.println("Generating list of summarizers");
    List<Summerizer[]> summarizers = m.generateSummarizerList(m.documents, probs, inferredModel);

    // summaries[j][i] = summary of document i produced by strategy j.
    Doc[][] summaries = new Doc[summarizers.size()][m.documents.size()];
    for (int i = 0; i < m.documents.size(); i++) {
      for (int j = 0; j < summarizers.size(); j++) {
        // Pre-fill the slot so a failing summarizer leaves a placeholder and the
        // ROUGE evaluation stays aligned with the document list.
        summaries[j][i] = new Doc();
        summaries[j][i].f = m.documents.get(i).f;
        try {
          System.out.println("Generating summary (" + j + ", " + i + ")");
          m.generateSummary(m.documents.get(i), summarizers.get(j)[i]);
          summaries[j][i].summary = m.documents.get(i).summary;
        } catch (Exception e) {
          // Best-effort: one bad summarizer/document pair must not abort the run.
          e.printStackTrace();
          summaries[j][i].summary = "NO_SUM";
        }
      }
      // Drop the (large) annotation so that we don't run out of memory.
      m.documents.get(i).setAno(null);
    }

    System.out.println("Start calculating ROUGE");
    int count = 0;

    // Generate the ROUGE evaluation file
    String rougeInFile = "ROUGE-IN.xml";
    RougeEvalBuilder reb = m.genRouge();
    reb.write(rougeInFile);

    // try-with-resources guarantees the collapsed-summaries file is closed even if
    // writing or the ROUGE script fails partway through.
    try (FileWriter fw = new FileWriter(new File("summariesCollapsed"))) {
      for (Doc[] docSums : summaries) {
        fw.write(summarizers.get(count)[0].getClass() + "\n");
        for (Doc doc : docSums) {
          fw.write(doc.summary.replaceAll("\n", " ") + "\n");
        }

        // Write the summaries to disk
        m.writeSummaries(docSums, summarizers.get(count)[0].getClass());

        // Run the ROUGE script on the generated summaries and print the results
        RougeScript rs = new RougeScript(conf.getRougePath(), 95, 500, 2, 1.2);
        System.out.println(
            "Writing summaries to: results-" + summarizers.get(count)[0].getClass());
        rs.run(rougeInFile, "results-" + summarizers.get(count++)[0].getClass());
      }
    }
  }