Beispiel #1
0
 /**
  * Adds another vector to this one in place.
  *
  * <p>The term weights of {@code otherVector} are merged into this vector's weighted terms, and
  * its document length is added to this vector's document length.
  *
  * @param otherVector the vector whose weights and document length are accumulated into this one
  */
 public void plus(WeightedIntDocVector otherVector) {
   this.weightedTerms.plus(otherVector.weightedTerms);
   this.docLength = this.docLength + otherVector.docLength;
 }
Beispiel #2
0
  /**
   * Prints the highest-weighted terms of every topic stored in a beta file.
   *
   * <p>Command-line options: the beta file ({@code Settings.INPUT_OPTION}, required), the term
   * index file ({@code ParseCorpus.INDEX}, required) and the number of terms to show per topic
   * ({@code TOP_DISPLAY_OPTION}, optional, defaults to {@code TOP_DISPLAY}). Both files are read as
   * Hadoop sequence files.
   *
   * <p>For each topic the terms are printed in order of decreasing weight. Terms that share the
   * same weight are all retained (up to the requested count) rather than overwriting each other.
   *
   * <p>When the help option is given the usage is printed and the JVM exits with status 0. When the
   * arguments are missing or malformed, the problem is reported on standard error and the JVM exits
   * with status 1.
   *
   * @param args the command-line arguments
   * @return 0 once every topic has been printed
   * @throws Exception if the file system cannot be accessed, a path is invalid, or reading a
   *     sequence file fails
   */
  @SuppressWarnings("unchecked")
  public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(Settings.HELP_OPTION, false, "print the help message");
    options.addOption(
        OptionBuilder.withArgName(Settings.PATH_INDICATOR)
            .hasArg()
            .withDescription("input beta file")
            .create(Settings.INPUT_OPTION));
    options.addOption(
        OptionBuilder.withArgName(Settings.PATH_INDICATOR)
            .hasArg()
            .withDescription("term index file")
            .create(ParseCorpus.INDEX));
    options.addOption(
        OptionBuilder.withArgName(Settings.INTEGER_INDICATOR)
            .hasArg()
            .withDescription("display top terms only (default - 10)")
            .create(TOP_DISPLAY_OPTION));

    String betaString = null;
    String indexString = null;
    int topDisplay = TOP_DISPLAY;

    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    try {
      CommandLine line = parser.parse(options, args);

      if (line.hasOption(Settings.HELP_OPTION)) {
        formatter.printHelp(DisplayTopic.class.getName(), options);
        System.exit(0);
      }

      if (line.hasOption(Settings.INPUT_OPTION)) {
        betaString = line.getOptionValue(Settings.INPUT_OPTION);
      } else {
        throw new ParseException(
            "Parsing failed due to " + Settings.INPUT_OPTION + " not initialized...");
      }

      if (line.hasOption(ParseCorpus.INDEX)) {
        indexString = line.getOptionValue(ParseCorpus.INDEX);
      } else {
        throw new ParseException(
            "Parsing failed due to " + ParseCorpus.INDEX + " not initialized...");
      }

      if (line.hasOption(TOP_DISPLAY_OPTION)) {
        topDisplay = Integer.parseInt(line.getOptionValue(TOP_DISPLAY_OPTION));
      }
    } catch (ParseException pe) {
      System.err.println(pe.getMessage());
      formatter.printHelp(DisplayTopic.class.getName(), options);
      // Invalid arguments are a failure: report a non-zero status to the calling shell.
      System.exit(1);
    } catch (NumberFormatException nfe) {
      System.err.println(nfe.getMessage());
      System.exit(1);
    }

    JobConf conf = new JobConf(DisplayTopic.class);
    FileSystem fs = FileSystem.get(conf);

    Path indexPath = new Path(indexString);
    Preconditions.checkArgument(
        fs.exists(indexPath) && fs.isFile(indexPath), "Invalid index path...");

    Path betaPath = new Path(betaString);
    Preconditions.checkArgument(fs.exists(betaPath) && fs.isFile(betaPath), "Invalid beta path...");

    // Load the term id -> term string mapping. The reader gets its own try/finally so that it is
    // closed before the beta reader is opened instead of being silently dropped.
    Map<Integer, String> termIndex = new HashMap<Integer, String>();
    SequenceFile.Reader indexReader = null;
    try {
      IntWritable termId = new IntWritable();
      Text termText = new Text();
      indexReader = new SequenceFile.Reader(fs, indexPath, conf);
      while (indexReader.next(termId, termText)) {
        termIndex.put(termId.get(), termText.toString());
      }
    } finally {
      IOUtils.closeStream(indexReader);
    }

    SequenceFile.Reader betaReader = null;
    try {
      PairOfIntFloat topicKey = new PairOfIntFloat();
      HMapIFW beta = new HMapIFW();
      betaReader = new SequenceFile.Reader(fs, betaPath, conf);
      while (betaReader.next(topicKey, beta)) {
        System.out.println("==============================");
        System.out.println(
            "Top ranked " + topDisplay + " terms for Topic " + topicKey.getLeftElement());
        System.out.println("==============================");

        // Keep the best terms in two parallel arrays sorted by decreasing weight. Unlike a map
        // keyed by weight, this keeps distinct terms that happen to share the same weight.
        int capacity = Math.min(Math.max(topDisplay, 0), beta.keySet().size());
        int[] topIds = new int[capacity];
        float[] topWeights = new float[capacity];
        int kept = 0;

        for (int id : beta.keySet()) {
          float weight = beta.get(id);

          int slot;
          if (kept < capacity) {
            // Still room: append, then bubble up into position.
            slot = kept;
            kept++;
          } else if (capacity > 0 && weight > topWeights[capacity - 1]) {
            // Full: the new term displaces the current lowest-weighted entry.
            slot = capacity - 1;
          } else {
            continue;
          }

          // Strict comparison keeps earlier-seen terms ahead of later ones with equal weight.
          while (slot > 0 && topWeights[slot - 1] < weight) {
            topWeights[slot] = topWeights[slot - 1];
            topIds[slot] = topIds[slot - 1];
            slot--;
          }
          topWeights[slot] = weight;
          topIds[slot] = id;
        }

        for (int rank = 0; rank < kept; rank++) {
          if (termIndex.containsKey(topIds[rank])) {
            System.out.println(termIndex.get(topIds[rank]) + "\t\t" + topWeights[rank]);
          } else {
            System.out.println("How embarrassing! Term index not found...");
          }
        }
      }
    } finally {
      IOUtils.closeStream(betaReader);
    }

    return 0;
  }
Beispiel #3
0
 /**
  * Tells whether this vector holds a weight for the given term.
  *
  * @param termid the term id to look up
  * @return {@code true} if the weighted-term map contains {@code termid} as a key
  */
 public boolean containsTerm(int termid) {
   final boolean present = this.weightedTerms.containsKey(termid);
   return present;
 }
Beispiel #4
0
 /**
  * Returns the string form of the underlying weighted-term map.
  *
  * @return whatever {@code weightedTerms.toString()} produces
  */
 @Override
 public String toString() {
   final String rendered = this.weightedTerms.toString();
   return rendered;
 }
Beispiel #5
0
 /**
  * Divides every term weight in this vector by the given value, in place.
  *
  * @param l the divisor applied to each weight
  */
 public void normalizeWith(float l) {
   for (int termId : weightedTerms.keySet()) {
     float scaled = weightedTerms.get(termId) / l;
     weightedTerms.put(termId, scaled);
   }
 }
Beispiel #6
0
 /**
  * Computes the dot product of this vector's term weights with those of another vector.
  *
  * @param otherVector the vector to multiply with
  * @return the result of {@code weightedTerms.dot} applied to the other vector's weighted terms
  */
 public float dot(WeightedIntDocVector otherVector) {
   return this.weightedTerms.dot(otherVector.weightedTerms);
 }
Beispiel #7
0
 /**
  * Restores this vector from a binary stream.
  *
  * <p>The document length is read first as a variable-length int, then a fresh term map is
  * created and asked to read itself from the same stream. This mirrors the order used by
  * {@code write}.
  *
  * @param in the stream to read from
  * @throws IOException if reading from the stream fails
  */
 public void readFields(DataInput in) throws IOException {
   this.docLength = WritableUtils.readVInt(in);
   this.weightedTerms = new HMapIFW();
   this.weightedTerms.readFields(in);
 }
Beispiel #8
0
 /**
  * Serializes this vector to a binary stream.
  *
  * <p>The document length is written first as a variable-length int, followed by the weighted
  * term map's own serialized form.
  *
  * @param out the stream to write to
  * @throws IOException if writing to the stream fails
  */
 public void write(DataOutput out) throws IOException {
   WritableUtils.writeVInt(out, this.docLength);
   this.weightedTerms.write(out);
 }
Beispiel #9
0
 /**
  * Looks up the weight stored for a term.
  *
  * @param termid the term id to look up
  * @return the value the weighted-term map returns for {@code termid}
  */
 public float getWeight(int termid) {
   final float weight = this.weightedTerms.get(termid);
   return weight;
 }