示例#1
0
    /**
     * Loads document pairs from the locally cached copy of the file named by the {@code
     * "PwsimPairs"} job property into {@code pwsimMapping}.
     *
     * <p>Every local cache file whose path contains {@code getFilename(pwsimFile)} is read as a
     * SequenceFile of ({@code PairOfInts}, {@code IntWritable}) records. For each record the left
     * element of the pair is used as the E-side docno and the right element as the F-side docno.
     * When {@code langID == CLIRUtils.E} the F docno is appended to the list stored under the E
     * docno; otherwise the E docno is appended to the list stored under the F docno.
     *
     * @param pwsimMapping map from docno to the list of docnos paired with it; mutated in place
     * @param langID selects which side of each pair becomes the map key
     * @param job job configuration supplying the cache files and the {@code "PwsimPairs"} property
     * @param reporter not used by this method
     * @throws RuntimeException wrapping any exception raised while locating or reading the files
     */
    private static void loadPairs(
        HMapIV<ArrayListOfIntsWritable> pwsimMapping, int langID, JobConf job, Reporter reporter) {
      try {
        Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
        String pwsimFile = job.get("PwsimPairs");
        for (Path localFile : localFiles) {
          if (!localFile.toString().contains(getFilename(pwsimFile))) {
            continue;
          }

          SequenceFile.Reader reader =
              new SequenceFile.Reader(FileSystem.getLocal(job), localFile, job);
          int cnt = 0;
          try {
            // One key/value instance is enough: only primitive ints are copied out of the key
            // before the next call to reader.next() overwrites it.
            PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance();
            IntWritable value = (IntWritable) reader.getValueClass().newInstance();

            while (reader.next(key, value)) {
              int fDocno = key.getRightElement();
              int eDocno = key.getLeftElement();

              // Debug trace for two specific pairs.
              if ((eDocno == 6127 && fDocno == 1000000074)
                  || (eDocno == 6127 && fDocno == 1000000071)) {
                sLogger.info(key);
              }

              // NOTE(review): docnos are stored exactly as read; no offset is added here. The
              // foreign docnos presumably already carry the 1000000000 offset -- confirm upstream.
              int mapKey;
              int pairedDocno;
              if (langID == CLIRUtils.E) {
                mapKey = eDocno;
                pairedDocno = fDocno;
              } else {
                mapKey = fDocno;
                pairedDocno = eDocno;
              }

              if (!pwsimMapping.containsKey(mapKey)) {
                pwsimMapping.put(mapKey, new ArrayListOfIntsWritable());
              }
              pwsimMapping.get(mapKey).add(pairedDocno);
              cnt++;
            }
          } finally {
            // Release the file handle even when reading fails part-way through.
            reader.close();
          }
          sLogger.info(pwsimMapping.size() + "," + cnt + " pairs loaded from " + localFile);
        }
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }
  /**
   * Checks that iterating over a frequency distribution yields every (event, count) pair that was
   * set, by collecting clones of the pairs into a sorted set and comparing them in order.
   */
  @Test
  public void testIterable() {
    Int2IntFrequencyDistribution fd = new Int2IntFrequencyDistributionOpen();

    // (event, count) entries, inserted in this deliberately unsorted order.
    int[][] inserted = {{1, 1}, {4, 3}, {2, 4}, {5, 7}, {6, 9}, {3, 2}};
    for (int[] entry : inserted) {
      fd.set(entry[0], entry[1]);
    }

    assertEquals(6, fd.getNumberOfEvents());
    assertEquals(26, fd.getSumOfCounts());

    // Clone each pair before storing it, then let the TreeSet order them.
    SortedSet<PairOfInts> sortedPairs = new TreeSet<PairOfInts>();
    for (PairOfInts pair : fd) {
      sortedPairs.add(pair.clone());
    }

    assertEquals(6, sortedPairs.size());

    // Expected contents in the order the sorted set returns them.
    int[][] expected = {{1, 1}, {2, 4}, {3, 2}, {4, 3}, {5, 7}, {6, 9}};
    Iterator<PairOfInts> cursor = sortedPairs.iterator();
    for (int[] want : expected) {
      PairOfInts got = cursor.next();
      assertEquals(want[0], got.getLeftElement());
      assertEquals(want[1], got.getRightElement());
    }
  }
  /**
   * Runs this tool: looks up a few fixed terms in the index and prints what it finds.
   *
   * <p>For "starcross'd" each posting is printed together with the collection line found at the
   * offset given by the posting's left element. For "gold" and "silver" the complete postings
   * list and a histogram over the postings' right elements are printed. For "bronze" only a
   * message is printed when the term is absent from the index.
   *
   * @param args command-line arguments; both the INDEX and COLLECTION options are required
   * @return 0 on completion; argument errors terminate the JVM with status -1
   * @throws Exception if the index or the collection cannot be opened or read
   */
  @SuppressWarnings({"static-access"})
  public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
        OptionBuilder.withArgName("path").hasArg().withDescription("index path").create(INDEX));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("collection path")
            .create(COLLECTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();

    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) {
      System.out.println("args: " + Arrays.toString(args));
      HelpFormatter formatter = new HelpFormatter();
      formatter.setWidth(120);
      formatter.printHelp(LookupPostingsCompressed.class.getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX);
    String collectionPath = cmdline.getOptionValue(COLLECTION);

    if (collectionPath.endsWith(".gz")) {
      System.out.println("gzipped collection is not seekable: use compressed version!");
      System.exit(-1);
    }

    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);
    MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config);
    try {
      FSDataInputStream collection = fs.open(new Path(collectionPath));
      try {
        System.out.println("Looking up postings for the term \"starcross'd\"");
        ArrayListWritable<PairOfInts> postings = fetchPostings(reader, "starcross'd");
        if (postings != null) {
          for (PairOfInts pair : postings) {
            System.out.println(pair);
            collection.seek(pair.getLeftElement());
            // A BufferedReader keeps read-ahead data from the previous position, so a fresh one
            // is needed after every seek. It is not closed here because closing it would also
            // close the underlying collection stream.
            BufferedReader lineReader = new BufferedReader(new InputStreamReader(collection));
            System.out.println(lineReader.readLine());
          }
        }

        printTfHistogram(reader, "gold");
        printTfHistogram(reader, "silver");

        // For "bronze" only presence in the index is checked; its postings are not decoded.
        Text key = new Text("bronze");
        Writable w = reader.get(key, new BytesWritable());
        if (w == null) {
          System.out.println("the term bronze does not appear in the collection");
        }
      } finally {
        collection.close();
      }
    } finally {
      reader.close();
    }

    return 0;
  }

  /**
   * Fetches and decodes the postings list for {@code term}.
   *
   * @return the decoded postings, or {@code null} (after printing a message) when the index has
   *     no entry for the term
   */
  private ArrayListWritable<PairOfInts> fetchPostings(MapFile.Reader reader, String term)
      throws Exception {
    BytesWritable bytesValue = new BytesWritable();
    if (reader.get(new Text(term), bytesValue) == null) {
      System.out.println("the term " + term + " does not appear in the collection");
      return null;
    }
    return deserializePosting(bytesValue);
  }

  /**
   * Prints the complete postings list for {@code term}, followed by a histogram counting how
   * often each right-element value occurs among its postings.
   */
  private void printTfHistogram(MapFile.Reader reader, String term) throws Exception {
    ArrayListWritable<PairOfInts> postings = fetchPostings(reader, term);
    if (postings == null) {
      return;
    }
    System.out.println(
        "Complete postings list for '" + term + "': (" + postings.size() + ", " + postings + ")");

    Int2IntFrequencyDistribution hist = new Int2IntFrequencyDistributionEntry();
    for (PairOfInts pair : postings) {
      hist.increment(pair.getRightElement());
    }

    System.out.println("histogram of tf values for " + term);
    for (PairOfInts pair : hist) {
      System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
    }
  }