Ejemplo n.º 1
0
 /** Looks up key 496 through {@code reader} and checks the text stored under it. */
 @Test
 public void get() throws Exception {
   // vv MapFileSeekTest
   IntWritable lookupKey = new IntWritable(496);
   Text line = new Text();
   reader.get(lookupKey, line);
   String actual = line.toString();
   assertThat(actual, is("One, two, buckle my shoe"));
   // ^^ MapFileSeekTest
 }
Ejemplo n.º 2
0
 /**
  * Prints the entries that the partition-comparison MapFile stores for the looked-up key.
  *
  * <p>The reader is always closed, even if the lookup or the printing fails.
  *
  * @param job configuration handed to the {@code MapFile.Reader}
  * @param hdfs file system the MapFile is opened on
  * @throws IOException if the MapFile cannot be opened, read, or closed
  */
 public static void printComparisonList(JobConf job, FileSystem hdfs) throws IOException {
   MapFile.Reader partCompListReader =
       new MapFile.Reader(hdfs, partitionComparisonList.getName(), job);
   try {
     // NOTE(review): part is never set before the lookup, so this queries the empty key —
     // confirm that is the intended entry.
     Text part = new Text();
     TextArrayWritable array = new TextArrayWritable();

     // get() returns null when the key is absent; in that case the array was not filled in,
     // so its contents must not be touched.
     boolean found = partCompListReader.get(part, array) != null;

     System.out.println(part.toString() + " neighbors: ");
     if (found) {
       for (int i = 0; i < array.get().length; i++) {
         System.out.print(array.get()[i] + ", ");
       }
     }
   } finally {
     partCompListReader.close();
   }
 }
Ejemplo n.º 3
0
  /**
   * Runs {@code BuildInvertedIndex} over the collection via {@code IntegrationUtils.exec} and
   * spot-checks the entry written for the term "gold" in {@code part-r-00000}.
   *
   * <p>The MapFile reader is closed in a {@code finally} block so a failing assertion does not
   * leak it.
   */
  @Test
  public void testInvertedIndexing() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    assertTrue(fs.exists(collectionPath));

    String[] args =
        new String[] {
          "hadoop jar",
          IntegrationUtils.getJar("dist", "cloud9"),
          edu.umd.cloud9.example.ir.BuildInvertedIndex.class.getCanonicalName(),
          IntegrationUtils.LOCAL_ARGS,
          "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
          "-input",
          collectionPath.toString(),
          "-output",
          tmpPrefix,
          "-numReducers",
          "1"
        };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    MapFile.Reader reader = new MapFile.Reader(new Path(tmpPrefix + "/part-r-00000"), conf);
    try {
      Text key = new Text();
      PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value =
          new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();

      key.set("gold");

      // get() returns null for an absent key; fail with a clear message instead of tripping
      // over an unpopulated value further down.
      assertTrue("term 'gold' missing from index", reader.get(key, value) != null);

      assertEquals(584, value.getLeftElement().get());

      ArrayListWritable<PairOfInts> postings = value.getRightElement();
      assertEquals(5303, postings.get(0).getLeftElement());
      assertEquals(684030, postings.get(100).getLeftElement());
      assertEquals(1634312, postings.get(200).getLeftElement());
    } finally {
      reader.close();
    }
  }
  /**
   * Joins one tab-separated employee record with its department name, looked up by the record's
   * seventh field (index 6) through {@code deptMapReader}.
   *
   * <p>Every call increments {@code MYCOUNTER.RECORD_COUNT}. Empty input lines produce no output.
   * For all other lines the first field becomes the output key and the remaining fields plus the
   * department name (or {@code "NOT-FOUND"} when the lookup yields nothing) become the output
   * value.
   *
   * @param key input key; not used
   * @param value one input line
   * @param context task context used for the counter and for emitting output
   * @throws IOException if the lookup or the write fails
   * @throws InterruptedException if the write is interrupted
   */
  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    context.getCounter(MYCOUNTER.RECORD_COUNT).increment(1);

    String line = value.toString();
    if (line.length() == 0) {
      // The output key/value objects are reused between calls, so writing here would re-emit
      // whatever the previous record left in them.
      return;
    }

    String[] arrEmpAttributes = line.split("\\t");
    txtMapLookupKey.set(arrEmpAttributes[6]);

    // txtMapLookupKey = deptNo, txtMapLookupValue = deptName.
    // get() returns null on a miss. The result has to be checked explicitly: comparing the
    // lookup value against null or a String via equals() never matches.
    boolean found = deptMapReader.get(txtMapLookupKey, txtMapLookupValue) != null;
    if (!found || txtMapLookupValue.toString().length() == 0) {
      txtMapLookupValue.set("NOT-FOUND");
    }

    txtMapOutputKey.set(arrEmpAttributes[0]); // empNo --> joinKey
    // NOTE(review): field 1 is written twice below; this looks like a copy-paste slip, but it is
    // kept because removing it would change the output column layout — confirm with consumers.
    txtMapOutputValue.set(
        arrEmpAttributes[1]
            + "\t"
            + arrEmpAttributes[1]
            + "\t"
            + arrEmpAttributes[2]
            + "\t"
            + arrEmpAttributes[3]
            + "\t"
            + arrEmpAttributes[4]
            + "\t"
            + arrEmpAttributes[5]
            + "\t"
            + arrEmpAttributes[6]
            + "\t" // deptNo
            + txtMapLookupValue.toString()); // deptName

    context.write(txtMapOutputKey, txtMapOutputValue);

    // Clear the reused lookup holders so nothing carries over into the next record.
    txtMapLookupValue.set("");
    txtMapLookupKey.set("");
  }
  /**
   * Runs this tool: looks up a few fixed terms in the index MapFile ({@code part-r-00000} under
   * the index path) and prints their postings, the matching collection lines, and histograms of
   * the postings' right-hand values.
   *
   * <p>Both the {@code INDEX} and {@code COLLECTION} options are required; on a parse error, a
   * missing option, or a {@code .gz} collection path the process exits with status -1. The index
   * reader and the collection stream are closed even when a lookup fails.
   *
   * @param args command-line arguments
   * @return 0 on completion
   * @throws Exception if the index or the collection cannot be opened or read
   */
  @SuppressWarnings({"static-access"})
  public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
        OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX));
    // The collection is only ever read by this tool, so it is not an output path.
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("collection path")
            .create(COLLECTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();

    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) {
      System.out.println("args: " + Arrays.toString(args));
      HelpFormatter formatter = new HelpFormatter();
      formatter.setWidth(120);
      formatter.printHelp(LookupPostingsCompressed.class.getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX);
    String collectionPath = cmdline.getOptionValue(COLLECTION);

    if (collectionPath.endsWith(".gz")) {
      System.out.println("gzipped collection is not seekable: use compressed version!");
      System.exit(-1);
    }

    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);
    MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config);
    try {
      FSDataInputStream collection = fs.open(new Path(collectionPath));
      try {
        Text key = new Text();
        ArrayListWritable<PairOfInts> postings;
        BytesWritable bytesValue = new BytesWritable();

        System.out.println("Looking up postings for the term \"starcross'd\"");
        key.set("starcross'd");

        reader.get(key, bytesValue);
        postings = deserializePosting(bytesValue);

        for (PairOfInts pair : postings) {
          System.out.println(pair);
          collection.seek(pair.getLeftElement());
          // A BufferedReader reads ahead, so one created before the seek would keep serving
          // lines from its old buffer. Wrap the stream afresh after every seek. The wrapper is
          // deliberately not closed: closing it would close the underlying collection stream.
          BufferedReader lineReader = new BufferedReader(new InputStreamReader(collection));
          System.out.println(lineReader.readLine());
        }

        bytesValue = new BytesWritable();
        key.set("gold");
        reader.get(key, bytesValue);
        postings = deserializePosting(bytesValue);
        System.out.println(
            "Complete postings list for 'gold': (" + postings.size() + ", " + postings + ")");

        Int2IntFrequencyDistribution goldHist = new Int2IntFrequencyDistributionEntry();
        for (PairOfInts pair : postings) {
          goldHist.increment(pair.getRightElement());
        }

        System.out.println("histogram of tf values for gold");
        for (PairOfInts pair : goldHist) {
          System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
        }

        bytesValue = new BytesWritable();
        key.set("silver");
        reader.get(key, bytesValue);
        postings = deserializePosting(bytesValue);
        System.out.println(
            "Complete postings list for 'silver': (" + postings.size() + ", " + postings + ")");

        Int2IntFrequencyDistribution silverHist = new Int2IntFrequencyDistributionEntry();
        for (PairOfInts pair : postings) {
          silverHist.increment(pair.getRightElement());
        }

        System.out.println("histogram of tf values for silver");
        for (PairOfInts pair : silverHist) {
          System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
        }

        bytesValue = new BytesWritable();
        key.set("bronze");

        // get() returns null when the key is not present in the MapFile.
        if (reader.get(key, bytesValue) == null) {
          System.out.println("the term bronze does not appear in the collection");
        }
      } finally {
        collection.close();
      }
    } finally {
      reader.close();
    }

    return 0;
  }