/**
 * Checks that after {@code reader.seek} is asked for key 496, the next record read from the
 * reader has key 497 and the expected text value.
 */
@Test
 public void seek() throws Exception {
   boolean seekResult = reader.seek(new IntWritable(496));
   assertThat(seekResult, is(true));

   boolean readResult = reader.next(key, value);
   assertThat(readResult, is(true));

   int keyRead = ((IntWritable) key).get();
   assertThat(keyRead, is(497));

   String valueRead = ((Text) value).toString();
   assertThat(valueRead, is("Three, four, shut the door"));
 }
 /**
  * Prints one entry of the partition comparison list MapFile to standard output.
  *
  * <p>Opens the MapFile named by {@code partitionComparisonList.getName()}, performs a lookup
  * with a freshly created {@code Text} key, and prints that key followed by every element of the
  * array passed to the lookup.
  *
  * @param job configuration handed to the MapFile reader
  * @param hdfs file system the MapFile is opened on
  * @throws IOException if the MapFile cannot be opened, read, or closed
  */
 public static void printComparisonList(JobConf job, FileSystem hdfs) throws IOException {
   MapFile.Reader partCompListReader =
       new MapFile.Reader(hdfs, partitionComparisonList.getName(), job);
   try {
     Text part = new Text();
     TextArrayWritable array = new TextArrayWritable();
     // NOTE(review): the result of get() is not checked, so a missing key is not detected
     // here - confirm whether that can happen for this file.
     partCompListReader.get(part, array);
     System.out.println(part.toString() + " neighbors: ");
     for (int i = 0; i < array.get().length; i++) {
       System.out.print(array.get()[i] + ", ");
     }
   } finally {
     // The reader used to be left open; always release it, even when reading fails.
     partCompListReader.close();
   }
 }
  /**
   * Prepares the fixture used by the tests: runs {@code MapFileDemo.main} with {@code MAP_URI}
   * (presumably this writes the MapFile under test - confirm), opens a reader on
   * {@code MAP_URI}, and creates empty key and value instances of the classes the reader
   * reports.
   *
   * @throws IOException if the file system or the MapFile cannot be opened
   */
  @Before
  public void setUp() throws IOException {
    String[] demoArgs = {MAP_URI};
    MapFileDemo.main(demoArgs);

    Configuration conf = new Configuration();
    URI mapLocation = URI.create(MAP_URI);
    fs = FileSystem.get(mapLocation, conf);
    reader = new MapFile.Reader(fs, MAP_URI, conf);

    // Instantiate holders of whatever key/value types the MapFile declares.
    Object freshKey = ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    key = (WritableComparable<?>) freshKey;
    Object freshValue = ReflectionUtils.newInstance(reader.getValueClass(), conf);
    value = (Writable) freshValue;
  }
  /**
   * Loads the partition sizes MapFile, prints the size of every partition, and then runs the
   * staged computation, finishing by writing the comparison list.
   *
   * <p>Returns early (after printing a message) when the partition sizes file does not exist,
   * and returns silently when at most one partition name is known. Stage 2 is only run when
   * {@code step} is 2 or 12.
   *
   * @param step stage selector; the values 2 and 12 additionally enable stage 2
   * @param inputDir path whose file system is used for all reads and writes
   * @param job job configuration; also supplies the debug-stages flag
   * @throws IOException if the file system cannot be obtained or the reader cannot be
   *     opened or closed
   */
  public static void main(int step, Path inputDir, JobConf job) throws IOException {
    FileSystem hdfs = inputDir.getFileSystem(job);
    if (!hdfs.exists(Collector.partitionSizesPath)) {
      System.out.println("Partition sizes file does not exists!");
      return;
    }
    debugStages = job.getBoolean(Config.DEBUG_STAGES_PROPERTY, Config.DEBUG_STAGES_VALUE);
    // NOTE(review): existence is checked on the full path but the reader is opened on
    // getName() with a fresh JobConf rather than `job` - confirm this is intended.
    MapFile.Reader partitionSizeReader =
        new MapFile.Reader(hdfs, Collector.partitionSizesPath.getName(), new JobConf());
    Text partitionK = new Text();
    LongWritable partSizeV = new LongWritable();

    try {
      while (partitionSizeReader.next(partitionK, partSizeV)) {
        String partitionName = partitionK.toString();
        partitionsNames.add(partitionName);
        partitionsSizes.put(partitionName, partSizeV.get());
      }
    } catch (Exception e) {
      // Reading stays best-effort (whatever was loaded so far is still used), but the
      // failure is no longer swallowed silently.
      System.err.println(
          "Failed while reading partition sizes; continuing with "
              + partitionsNames.size()
              + " partition(s): "
              + e);
    } finally {
      // The reader used to be left open.
      partitionSizeReader.close();
    }

    for (int i = 0; i < partitionsNames.size(); i++) {
      System.out.println(
          "Partition "
              + partitionsNames.get(i)
              + " has "
              + partitionsSizes.get(partitionsNames.get(i))
              + " vectors.");
    }

    if (partitionsNames.size() <= 1) {
      return;
    }
    stage0();
    printUndirectedNeighbors("Stage0");
    printPartitionsStat("Stage0");

    printCircularPartitionsWeight("\nCircular");
    calcCWStandardDeviation();

    stage1();
    printDirectedNeighbors("Stage1");
    System.out.println("Stage 1 final weights: ");
    printPartitionsWeights("Stage1");
    if ((step == 2) || (step == 12)) {
      stage2();
      printDirectedNeighbors("Stage2");
      System.out.println("Stage 2 final weights: ");
      printPartitionsWeights("Stage2");
    }
    // stage3(job, hdfs);
    writeComparisonList(job, hdfs);
    // printComparisonList(job, hdfs);// remove
  }
 /**
  * Looks up key 496 directly through {@code reader.get} and checks the text written into the
  * supplied value object.
  */
 @Test
 public void get() throws Exception {
   // vv MapFileSeekTest
   // A local Text shadows the fixture's value field for this lookup.
   Text value = new Text();
   reader.get(new IntWritable(496), value);
   assertThat(value.toString(), is("One, two, buckle my shoe"));
   // ^^ MapFileSeekTest
 }
Beispiel #6
0
  /**
   * Builds the in-memory bucket cache by reading every entry of the bucket-cache MapFile found
   * under each path returned by {@code PathUtils.getCachePaths(conf)}, then prints each loaded
   * bucket to standard output.
   *
   * @param conf configuration used to resolve the cache paths and to open the MapFiles
   * @throws IOException if a MapFile cannot be opened, read, or closed
   */
  public BucketCache(Configuration conf) throws IOException {
    bucketCache = new HashMap<IntWritable, Bucket>();

    for (String cachePath : PathUtils.getCachePaths(conf)) {
      String bucketCachePath = cachePath + BUCKET_CACHE_FOLDER;
      MapFile.Reader reader = new MapFile.Reader(new Path(bucketCachePath), conf);
      try {
        IntWritable key = new IntWritable();
        Bucket value = new Bucket();
        while (reader.next(key, value)) {
          // The same key/value instances are handed to every next() call, so store copies.
          bucketCache.put(new IntWritable(key.get()), new Bucket(value));
        }
      } finally {
        // Previously one reader per cache path was left open.
        reader.close();
      }
    }

    for (IntWritable i : bucketCache.keySet()) {
      System.out.println("Loaded bucket from cache:" + i.get() + ":" + bucketCache.get(i));
    }
  }
  /**
   * Runs {@code BuildInvertedIndex} through {@code IntegrationUtils.exec} and then verifies the
   * entry stored for the term "gold" in the resulting {@code part-r-00000} MapFile: the left
   * element of the value and the left elements of three sampled postings.
   */
  @Test
  public void testInvertedIndexing() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    assertTrue(fs.exists(collectionPath));

    String[] args =
        new String[] {
          "hadoop jar",
          IntegrationUtils.getJar("dist", "cloud9"),
          edu.umd.cloud9.example.ir.BuildInvertedIndex.class.getCanonicalName(),
          IntegrationUtils.LOCAL_ARGS,
          "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
          "-input",
          collectionPath.toString(),
          "-output",
          tmpPrefix,
          "-numReducers",
          "1"
        };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    MapFile.Reader reader = new MapFile.Reader(new Path(tmpPrefix + "/part-r-00000"), conf);
    try {
      Text key = new Text();
      PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value =
          new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();

      key.set("gold");

      // MapFile.Reader.get returns null for a missing key; fail here with a clear message
      // instead of failing later on an unpopulated value.
      assertTrue("term 'gold' not found in the index", reader.get(key, value) != null);

      assertEquals(584, value.getLeftElement().get());

      ArrayListWritable<PairOfInts> postings = value.getRightElement();
      assertEquals(5303, postings.get(0).getLeftElement());
      assertEquals(684030, postings.get(100).getLeftElement());
      assertEquals(1634312, postings.get(200).getLeftElement());
    } finally {
      // Close the reader even when an assertion above fails.
      reader.close();
    }
  }
 /**
  * Reads the next key/value pair from the {@code BloomMapFile.Reader} wrapped by the given
  * stream and stores the converted objects in the supplied holders.
  *
  * @param hdfsistr stream whose {@code getIn()} yields the reader to pull from
  * @param key holder that receives the object converted from the key read
  * @param value holder that receives the object converted from the value read
  * @return the sum of the two sizes filled in by {@code getObject}, or 0 when
  *     {@code reader.next} reports that no pair was read
  * @throws RuntimeCamelException wrapping any exception raised while reading
  */
 @Override
 public long next(HdfsInputStream hdfsistr, Holder<Object> key, Holder<Object> value) {
   try {
     MapFile.Reader mapReader = (BloomMapFile.Reader) hdfsistr.getIn();

     // Fresh writables of the types the file declares, to be filled by the read below.
     WritableComparable<?> nextKey =
         (WritableComparable<?>)
             ReflectionUtils.newInstance(mapReader.getKeyClass(), new Configuration());
     Writable nextValue =
         (Writable) ReflectionUtils.newInstance(mapReader.getValueClass(), new Configuration());

     if (!mapReader.next(nextKey, nextValue)) {
       return 0;
     }

     Holder<Integer> keyBytes = new Holder<Integer>();
     Holder<Integer> valueBytes = new Holder<Integer>();
     key.value = getObject(nextKey, keyBytes);
     value.value = getObject(nextValue, valueBytes);
     return keyBytes.value + valueBytes.value;
   } catch (Exception ex) {
     throw new RuntimeCamelException(ex);
   }
 }
  /**
   * Joins one tab-separated employee record with the department name looked up through
   * {@code deptMapReader} and emits the result keyed by the record's first column.
   *
   * <p>Column 6 of the record is used as the lookup key. When the lookup leaves the lookup
   * value empty, the department name is emitted as {@code "NOT-FOUND"}. The record counter is
   * incremented for every input, including empty lines.
   *
   * @param key input key; not used by this method
   * @param value one input line, expected to hold at least seven tab-separated columns
   * @param context task context used for the counter and for emitting output
   * @throws IOException if the lookup or the write fails
   * @throws InterruptedException if the write is interrupted
   */
  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    context.getCounter(MYCOUNTER.RECORD_COUNT).increment(1);

    String line = value.toString();
    if (line.length() > 0) {
      String[] arrEmpAttributes = line.split("\\t");
      txtMapLookupKey.set(arrEmpAttributes[6]);

      // txtMapLookupKey = deptNo
      // txtMapLookupValue = deptName
      deptMapReader.get(txtMapLookupKey, txtMapLookupValue);

      // The old check used equals(null) / equals(""), which is never true for a Text
      // (Text.equals only matches another Text), so "NOT-FOUND" was never emitted.
      // Compare the string content instead.
      if (txtMapLookupValue.toString().isEmpty()) {
        txtMapLookupValue.set("NOT-FOUND");
      }

      txtMapOutputKey.set(arrEmpAttributes[0]); // empNo --> joinKey
      // NOTE(review): column 1 is emitted twice; kept as-is because downstream consumers may
      // rely on this layout - confirm whether that is intentional.
      txtMapOutputValue.set(
          arrEmpAttributes[1]
              + "\t"
              + arrEmpAttributes[1]
              + "\t"
              + arrEmpAttributes[2]
              + "\t"
              + arrEmpAttributes[3]
              + "\t"
              + arrEmpAttributes[4]
              + "\t"
              + arrEmpAttributes[5]
              + "\t"
              + arrEmpAttributes[6]
              + "\t" // deptNo
              + txtMapLookupValue.toString()); // deptName
    }
    // NOTE(review): this write also runs for empty input lines, re-emitting whatever the
    // output key/value currently hold - confirm whether it belongs inside the block above.
    context.write(txtMapOutputKey, txtMapOutputValue);
    // Reset the lookup holders so an unsuccessful lookup on the next record is detectable.
    txtMapLookupValue.set("");
    txtMapLookupKey.set("");
  }
  /**
   * Runs this tool.
   *
   * <p>Parses the {@code INDEX} and {@code COLLECTION} options, opens the {@code part-r-00000}
   * MapFile under the index path together with the collection file, and prints:
   *
   * <ul>
   *   <li>every posting of the term "starcross'd", each followed by the line read from the
   *       collection at the offset given by the posting's left element;
   *   <li>the complete postings list and a histogram of the postings' right elements for the
   *       terms "gold" and "silver";
   *   <li>a message when the term "bronze" is not present in the index.
   * </ul>
   *
   * <p>The process exits with status -1 when the command line cannot be parsed, when a required
   * option is missing, or when the collection path ends with {@code .gz}.
   *
   * @param args command-line arguments
   * @return 0 on completion
   * @throws Exception if reading the index or the collection fails
   */
  @SuppressWarnings({"static-access"})
  public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
        OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("output path")
            .create(COLLECTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();

    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) {
      System.out.println("args: " + Arrays.toString(args));
      HelpFormatter formatter = new HelpFormatter();
      formatter.setWidth(120);
      formatter.printHelp(LookupPostingsCompressed.class.getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX);
    String collectionPath = cmdline.getOptionValue(COLLECTION);

    if (collectionPath.endsWith(".gz")) {
      System.out.println("gzipped collection is not seekable: use compressed version!");
      System.exit(-1);
    }

    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);
    MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config);
    try {
      FSDataInputStream collection = fs.open(new Path(collectionPath));
      try {
        Text key = new Text();
        BytesWritable bytesValue = new BytesWritable();

        System.out.println("Looking up postings for the term \"starcross'd\"");
        key.set("starcross'd");

        reader.get(key, bytesValue);
        ArrayListWritable<PairOfInts> postings = deserializePosting(bytesValue);

        for (PairOfInts pair : postings) {
          System.out.println(pair);
          collection.seek(pair.getLeftElement());
          // A BufferedReader keeps its own read-ahead buffer, so one created before the seek
          // could return stale data; wrap the stream again after every seek. It is
          // deliberately not closed here because that would close the collection stream.
          BufferedReader lineReader = new BufferedReader(new InputStreamReader(collection));
          System.out.println(lineReader.readLine());
        }

        printPostingsAndTfHistogram(reader, "gold");
        printPostingsAndTfHistogram(reader, "silver");

        bytesValue = new BytesWritable();
        key.set("bronze");
        Writable w = reader.get(key, bytesValue);

        if (w == null) {
          System.out.println("the term bronze does not appear in the collection");
        }
      } finally {
        collection.close();
      }
    } finally {
      // Release the index reader even when a lookup above fails.
      reader.close();
    }

    return 0;
  }

  /** Prints the complete postings list of {@code term} and a histogram of its right elements. */
  private void printPostingsAndTfHistogram(MapFile.Reader reader, String term) throws Exception {
    Text key = new Text();
    key.set(term);
    BytesWritable bytesValue = new BytesWritable();
    reader.get(key, bytesValue);
    ArrayListWritable<PairOfInts> postings = deserializePosting(bytesValue);
    System.out.println(
        "Complete postings list for '" + term + "': (" + postings.size() + ", " + postings + ")");

    Int2IntFrequencyDistribution hist = new Int2IntFrequencyDistributionEntry();
    for (PairOfInts pair : postings) {
      hist.increment(pair.getRightElement());
    }

    System.out.println("histogram of tf values for " + term);
    for (PairOfInts pair : hist) {
      System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
    }
  }
 /**
  * Closes {@code deptMapReader}.
  *
  * @param context task context; not used by this method
  * @throws IOException if closing the reader fails
  */
 @Override
 protected void cleanup(Context context) throws IOException, InterruptedException {
   deptMapReader.close();
 }
  /**
   * Compares sentence pairs by Jaccard similarity and writes the maximum similarity of every
   * pair to one of two local files, depending on whether the pair is a known match.
   *
   * <p>Three MapFiles ({@code id2sentence.map}, {@code sentence2translation.map} and
   * {@code sentencematchpairs.map}) are loaded fully into memory. Then, for every {@code i} and
   * {@code j} in {@code [0, 1000)}, the sentence stored under id {@code (2*j+1)*nSamples} is
   * compared with every sentence whose id is listed for {@code 2*i} in the translation map, and
   * the largest similarity is written to the match file when the pair {@code (2*i, 2*j+1)} is
   * among the loaded match pairs, and to the no-match file otherwise.
   *
   * @param args command-line arguments: match output path, no-match output path and the
   *     number of samples
   * @return 0 on success; -1 when the arguments are invalid or an I/O error occurs
   * @throws Exception on failures other than {@link IOException} (for example a non-numeric
   *     sample count)
   */
  @SuppressWarnings("static-access")
  @Override
  public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("output path")
            .create(matchOutput));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("output path")
            .create(nomatchOutput));
    options.addOption(
        OptionBuilder.withArgName("integer")
            .hasArg()
            .withDescription("number of samples")
            .create(nSamplesOption));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    if (!cmdline.hasOption(matchOutput)
        || !cmdline.hasOption(nomatchOutput)
        || !cmdline.hasOption(nSamplesOption)) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.setWidth(120);
      formatter.printHelp(this.getClass().getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    String matchOutputPath = cmdline.getOptionValue(matchOutput);
    String nomatchOutputPath = cmdline.getOptionValue(nomatchOutput);
    String nSamplesIn = cmdline.getOptionValue(nSamplesOption);

    LOG.info("Tool name: " + this.getClass().getName());

    JobConf conf = new JobConf(getConf(), JaccardCompare.class);
    conf.setJobName("JaccardCompare");

    int nSentences = 1000;
    int nSamples = Integer.parseInt(nSamplesIn);

    // Declared outside the try block so that the finally block can release whatever was opened.
    BufferedWriter dosM = null;
    BufferedWriter dosNM = null;
    MapFile.Reader id2sentenceReader = null;
    MapFile.Reader sentence2translationReader = null;
    MapFile.Reader sentencematchReader = null;
    try {
      dosM =
          new BufferedWriter(
              new OutputStreamWriter(new FileOutputStream(new File(matchOutputPath))));
      dosNM =
          new BufferedWriter(
              new OutputStreamWriter(new FileOutputStream(new File(nomatchOutputPath))));

      id2sentenceReader = new MapFile.Reader(new Path("id2sentence.map/part-00000"), conf);
      HashMap<Integer, ArrayListWritable<Text>> id2sentence =
          new HashMap<Integer, ArrayListWritable<Text>>();
      IntWritable key = new IntWritable();
      ArrayListWritable<Text> val = new ArrayListWritable<Text>();
      while (id2sentenceReader.next(key, val)) {
        id2sentence.put(key.get(), val);
        // A fresh value object per entry, so stored values are not overwritten by later reads.
        val = new ArrayListWritable<Text>();
      }

      sentence2translationReader =
          new MapFile.Reader(new Path("sentence2translation.map/part-00000"), conf);
      HashMap<Integer, ArrayListOfIntsWritable> sentence2translation =
          new HashMap<Integer, ArrayListOfIntsWritable>();
      IntWritable key2 = new IntWritable();
      ArrayListOfIntsWritable val2 = new ArrayListOfIntsWritable();
      while (sentence2translationReader.next(key2, val2)) {
        sentence2translation.put(key2.get(), val2);
        val2 = new ArrayListOfIntsWritable();
      }

      sentencematchReader =
          new MapFile.Reader(new Path("sentencematchpairs.map/part-00000"), conf);
      HashSet<PairOfInts> sentencematchpairs = new HashSet<PairOfInts>();
      PairOfInts key3 = new PairOfInts();
      IntWritable val3 = new IntWritable();
      while (sentencematchReader.next(key3, val3)) {
        sentencematchpairs.add(key3);
        key3 = new PairOfInts();
      }

      System.out.println("Done reading");
      PairOfInts p = new PairOfInts();
      ArrayListWritable<Text> eSentence = new ArrayListWritable<Text>();
      for (int i = 0; i < nSentences; i++) {
        if (i % 100 == 0) {
          System.out.println("eLine " + i);
        }
        // NOTE(review): a missing entry for 2*i yields null here and fails in the loop below -
        // confirm that the translation map always covers these ids.
        ArrayListOfIntsWritable transIdList = sentence2translation.get(2 * i);
        for (int j = 0; j < nSentences; j++) {
          ArrayListWritable<Text> fSentence = id2sentence.get((2 * j + 1) * nSamples);
          float jsimMax = -1.0f;
          for (int id : transIdList) {
            eSentence = id2sentence.get(id);
            float jsim = JaccardSim.jaccardSim(eSentence, fSentence);
            if (jsim > jsimMax) {
              jsimMax = jsim;
            }
          }
          // The lookup pair is always ordered with the smaller id first.
          if (2 * i < 2 * j + 1) {
            p.set(2 * i, 2 * j + 1);
          } else {
            p.set(2 * j + 1, 2 * i);
          }
          if (sentencematchpairs.contains(p)) {
            if (jsimMax < .5) {
              System.out.println("Low match: ");
              System.out.println("\teSentence: " + i + " " + eSentence);
              System.out.println("\tfSentence: " + j + " " + fSentence);
            }
            dosM.write(Float.toString(jsimMax));
            dosM.write("\n");
          } else {
            dosNM.write(Float.toString(jsimMax));
            dosNM.write("\n");
          }
        }
      }

      // Close the writers on the success path so a failing final flush is reported below
      // instead of being hidden by the quiet close in the finally block.
      dosM.close();
      dosNM.close();
    } catch (IOException e2) {
      // Previously the stack trace was printed and the tool still reported success.
      LOG.error("I/O failure while comparing sentences", e2);
      return -1;
    } finally {
      closeQuietly(sentencematchReader);
      closeQuietly(sentence2translationReader);
      closeQuietly(id2sentenceReader);
      closeQuietly(dosM);
      closeQuietly(dosNM);
    }

    return 0;
  }

  /** Closes {@code resource} if it is non-null, ignoring any failure raised by the close. */
  private static void closeQuietly(java.io.Closeable resource) {
    if (resource == null) {
      return;
    }
    try {
      resource.close();
    } catch (IOException ignored) {
      // Nothing useful can be done at this point; the primary outcome is already decided.
    }
  }