@Before
  public void setUp() throws Exception {
    // create local Pig server
    pigServer = UnitTestUtil.makePigServer();

    // create temp SequenceFile
    File tempFile = File.createTempFile("test", ".txt");
    tempFilename = tempFile.getAbsolutePath();
    Path path = new Path("file:///" + tempFilename);
    Configuration conf = new Configuration();
    FileSystem fs = path.getFileSystem(conf);
    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
      writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
      for (int i = 0; i < DATA.length; ++i) {
        key.set(i);
        value.set(DATA[i]);
        writer.append(key, value);
      }
    } finally {
      IOUtils.closeStream(writer);
    }
  }
Example #2
 @Override
 public void write(DataOutput dataOutput) throws IOException {
   dataOutput.writeLong(leftCoordinate);
   Text.writeString(dataOutput, samRecord);
   Text.writeString(dataOutput, rfname);
   // Text.writeString(dataOutput, sqTag);
 }
Example #3
 /**
  * Test readLine for various kinds of line termination sequences. Varies the buffer size to
  * stress test. Also checks that the returned value matches the string length.
  *
  * @throws Exception
  */
 @Test
 public void testNewLines() throws Exception {
   final String STR = "a\nbb\n\nccc\rdddd\r\r\r\n\r\neeeee";
   final int STRLENBYTES = STR.getBytes().length;
   Text out = new Text();
   for (int bufsz = 1; bufsz < STRLENBYTES + 1; ++bufsz) {
     LineReader in = makeStream(STR, bufsz);
     int c = 0;
     c += in.readLine(out); // "a"\n
     assertEquals("line1 length, bufsz:" + bufsz, 1, out.getLength());
     c += in.readLine(out); // "bb"\n
     assertEquals("line2 length, bufsz:" + bufsz, 2, out.getLength());
     c += in.readLine(out); // ""\n
     assertEquals("line3 length, bufsz:" + bufsz, 0, out.getLength());
     c += in.readLine(out); // "ccc"\r
     assertEquals("line4 length, bufsz:" + bufsz, 3, out.getLength());
     c += in.readLine(out); // dddd\r
     assertEquals("line5 length, bufsz:" + bufsz, 4, out.getLength());
     c += in.readLine(out); // ""\r
     assertEquals("line6 length, bufsz:" + bufsz, 0, out.getLength());
     c += in.readLine(out); // ""\r\n
     assertEquals("line7 length, bufsz:" + bufsz, 0, out.getLength());
     c += in.readLine(out); // ""\r\n
     assertEquals("line8 length, bufsz:" + bufsz, 0, out.getLength());
     c += in.readLine(out); // "eeeee"EOF
     assertEquals("line9 length, bufsz:" + bufsz, 5, out.getLength());
     assertEquals("end of file, bufsz: " + bufsz, 0, in.readLine(out));
     assertEquals("total bytes, bufsz: " + bufsz, c, STRLENBYTES);
   }
 }
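The test above relies on a makeStream helper that is not shown here. A minimal sketch, assuming the helper simply wraps the test string in an in-memory stream and hands it to org.apache.hadoop.util.LineReader with the requested buffer size (the helper name comes from the test; the body is an assumption):

 // Hypothetical helper assumed by testNewLines; needs java.io.ByteArrayInputStream
 // and org.apache.hadoop.util.LineReader on the classpath. Uses the platform default
 // charset, matching STR.getBytes() in the test.
 private LineReader makeStream(String str, int bufsz) throws IOException {
   return new LineReader(new ByteArrayInputStream(str.getBytes()), bufsz);
 }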
    protected void cleanup(Context context) throws IOException, InterruptedException {

      String[] keys = {
        "thereIsAChange",
        "onlyAdded",
        "onlyRemoved",
        "bothAddedAndRemoved",
        "totalIncreased",
        "totalDecresed",
        "nochange"
      };
      int[] values = {
        thereIsAChange,
        onlyAdded,
        onlyRemoved,
        bothAddedAndRemoved,
        totalIncreased,
        totalDecresed,
        nochange
      };
      Text key = new Text();
      Text val = new Text();
      for (int i = 0; i < keys.length; i++) {
        key.set(keys[i]);
        val.set(values[i] + "");
        context.write(key, val);
      }
    }
Example #5
 @Override
 public Writable create(Object value, TypeConverter typeConverter, Holder<Integer> size) {
   Text writable = new Text();
   writable.set(typeConverter.convertTo(String.class, value));
   size.value = writable.getBytes().length;
   return writable;
 }
  @Override
  protected void map(LongWritable key, Text value, Mapper.Context context)
      throws IOException, InterruptedException {
    String line = value.toString();
    StringTokenizer tokenizer = new StringTokenizer(line, "\t");
    if (tokenizer.countTokens() == 4) { // u.data record
      tokenizer.nextToken();
      String itemid = tokenizer.nextToken();
      String rating = tokenizer.nextToken();
      item.set(itemid);
      fields.set(rating);
      context.write(item, fields);
    } else { // u.item record
      tokenizer = new StringTokenizer(line, "|");
      String itemid = tokenizer.nextToken();
      String title = tokenizer.nextToken();
      String release = tokenizer.nextToken();
      // tokenizer.nextToken();
      String imdb = tokenizer.nextToken();
      fields.set(title + "\t" + release + "\t" + imdb);
      item.set(itemid);
      context.write(item, fields);
    }

    // TotalRecords counter
    Counter counter = context.getCounter("MyCounter", "TOTALRECORDS");
    counter.increment(1);
  }
  public static final VideologPair filerByErrCode(Text value) {
    String result = "";
    VideologPair pair = null;

    if (value == null) {
      pair = new VideologPair("");
      pair.setValue("");

      return pair;
    }

    String errCode = "";
    String[] items = value.toString().split("\t");

    if (items != null && items.length == 14) {
      // extract the err code: column index 8:
      errCode = items[7];

      // check the err code is valid, if not then ignore.
      if (VALID_ERR_CODE.contains(errCode) || errCode.startsWith("") || errCode.startsWith("")) {
        pair = new VideologPair(items[0] + "|" + items[4]);

        result = value.toString();

        pair.setValue(result);
      }
    } else {
      pair = new VideologPair("");
      pair.setValue("");
    }

    return pair;
  }
Example #8
    @Override
    public void map(LongWritable row, NullWritable ignored, Context context)
        throws IOException, InterruptedException {
      context.setStatus("Entering");
      long rowId = row.get();
      if (rand == null) {
        // we use 3 random numbers per row
        rand = new RandomGenerator(rowId * 3);
      }
      addKey();
      value.clear();
      // addRowId(rowId);
      addFiller(rowId);

      // New
      Mutation m = new Mutation(key);
      m.put(
          new Text("c"), // column family
          getRowIdString(rowId), // column qual
          new Value(value.toString().getBytes())); // data

      context.setStatus("About to add to accumulo");
      context.write(tableName, m);
      context.setStatus("Added to accumulo " + key.toString());
    }
    public void reduce(
        Text key,
        Iterator<Text> values,
        OutputCollector<Text, NullWritable> output,
        Reporter reporter)
        throws IOException {

      // convert a.b.c into a^Ab^Ac for easier import into Hive.
      String classC = key.toString();
      String asFields = classC.replace(INPUT_FIELD_SEP, OUTPUT_FIELD_SEP);

      Text outKey = new Text(asFields);

      Set<Integer> seenOctets = new HashSet<Integer>();

      while (values.hasNext()) {
        Text val = values.next();
        try {
          Integer lastOctet = Integer.valueOf(val.toString());
          if (!seenOctets.contains(lastOctet)) {
            // we have not seen this a.b.c.d before. emit one output entry for
            // the a.b.c, and memorize the d so we don't do this again for the
            // same IP. This is ok to buffer because there will be at most 256
            // unique entries.
            output.collect(outKey, NullWritable.get());
            seenOctets.add(lastOctet);
          }
        } catch (NumberFormatException nfe) {
          // ignore malformed input; just continue.
        }
      }
    }
 @Override
 protected void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
    // Get the full path and name of the input file
   String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
   if (pathName.contains("data.txt")) {
     String values[] = value.toString().split("\t");
     if (values.length < 3) {
        // Malformed data record: fewer than 3 fields, discard it
       return;
     } else {
        // Well-formed record: tag it with flag "1"
       TextPair tp = new TextPair(new Text(values[1]), new Text("1"));
       context.write(tp, new Text(values[0] + "\t" + values[2]));
     }
   }
   if (pathName.contains("info.txt")) {
     String values[] = value.toString().split("\t");
     if (values.length < 2) {
        // Malformed info record: fewer than 2 fields, discard it
       return;
     } else {
        // Well-formed record: tag it with flag "0"
       TextPair tp = new TextPair(new Text(values[0]), new Text("0"));
       context.write(tp, new Text(values[1]));
     }
   }
 }
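The join mapper above assumes a TextPair composite key type. The following is a minimal sketch of such a class in the common Writable-pair style; it is an illustration, not necessarily the project's actual TextPair (imports assumed: java.io.DataInput, java.io.DataOutput, java.io.IOException, org.apache.hadoop.io.Text, org.apache.hadoop.io.WritableComparable):

 // Hypothetical composite key: a pair of Text values, compared first by the left element.
 public class TextPair implements WritableComparable<TextPair> {
   private Text first;
   private Text second;

   public TextPair() {
     this(new Text(), new Text());
   }

   public TextPair(Text first, Text second) {
     this.first = first;
     this.second = second;
   }

   public Text getFirst() {
     return first;
   }

   public Text getSecond() {
     return second;
   }

   @Override
   public void write(DataOutput out) throws IOException {
     first.write(out);
     second.write(out);
   }

   @Override
   public void readFields(DataInput in) throws IOException {
     first.readFields(in);
     second.readFields(in);
   }

   @Override
   public int compareTo(TextPair other) {
     int cmp = first.compareTo(other.first);
     return cmp != 0 ? cmp : second.compareTo(other.second);
   }

   @Override
   public int hashCode() {
     return first.hashCode() * 163 + second.hashCode();
   }

   @Override
   public boolean equals(Object o) {
     if (o instanceof TextPair) {
       TextPair tp = (TextPair) o;
       return first.equals(tp.first) && second.equals(tp.second);
     }
     return false;
   }
 }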
Example #11
  /** Check whether the file list has duplicates. */
  private static void checkDuplication(FileSystem fs, Path file, Path sorted, Configuration conf)
      throws IOException {
    SequenceFile.Reader in = null;
    try {
      SequenceFile.Sorter sorter =
          new SequenceFile.Sorter(fs, new Text.Comparator(), Text.class, Text.class, conf);
      sorter.sort(file, sorted);
      in = new SequenceFile.Reader(fs, sorted, conf);

      Text prevdst = null, curdst = new Text();
      Text prevsrc = null, cursrc = new Text();
      while (in.next(curdst, cursrc)) {
        if (prevdst != null && curdst.equals(prevdst)) {
          throw new DuplicationException(
              "Invalid input, there are duplicated files in the sources: "
                  + prevsrc
                  + ", "
                  + cursrc);
        }
        prevdst = curdst;
        curdst = new Text();
        prevsrc = cursrc;
        cursrc = new Text();
      }
    } finally {
      checkAndClose(in);
    }
  }
Example #12
    @Override
    protected void reduce(
        SortedMapWritableComparable key, Iterable<BytesWritable> values, Context context)
        throws IOException, InterruptedException {
      StatisticsProtos.Statistics.Builder statisticsBuilder =
          StatisticsProtos.Statistics.newBuilder();
      StatisticsProtos.KeyValue.Builder kvBuilder = StatisticsProtos.KeyValue.newBuilder();

      for (SortedMapWritableComparable.Entry<WritableComparable, Writable> partEntry :
          key.entrySet()) {
        kvBuilder.clear();
        Text kText = (Text) partEntry.getKey();
        Text vText = (Text) partEntry.getValue();
        kvBuilder.setKey(kText.toString());
        kvBuilder.setValue(vText.toString());
        statisticsBuilder.addPartitions(kvBuilder);
      }
      for (Map.Entry<String, StatisticCalculator> statisticEntry :
          this.statGenConfiguration.getStatisticCalculators().entrySet()) {
        kvBuilder.clear();
        double result = statisticEntry.getValue().calculate(values);
        kvBuilder.setKey(statisticEntry.getKey());
        kvBuilder.setValue(String.valueOf(result));
        statisticsBuilder.addStatistics(kvBuilder);
      }
      statisticsBuilder.setTimestamp(new Date().getTime());
      context.write(
          new Text(genKeyFromMap(key)), new BytesWritable(statisticsBuilder.build().toByteArray()));
    }
Example #13
  @Override
  public void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    aggs.reset();

    for (Text value : values) {
      codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), input);
      if (cuboidLevel > 0) {
        aggs.aggregate(input, needAggr);
      } else {
        aggs.aggregate(input);
      }
    }
    aggs.collectStates(result);

    ByteBuffer valueBuf = codec.encode(result);

    outputValue.set(valueBuf.array(), 0, valueBuf.position());
    context.write(key, outputValue);

    counter++;
    if (counter % BatchConstants.NORMAL_RECORD_LOG_THRESHOLD == 0) {
      logger.info("Handled " + counter + " records!");
    }
  }
  /**
   * Generate random data, compress it, index and md5 hash the data. Then read it all back and md5
   * that too, to verify that it all went ok.
   *
   * @param testWithIndex Should we index or not?
   * @param charsToOutput How many characters of random data should we output.
   * @throws IOException
   * @throws NoSuchAlgorithmException
   * @throws InterruptedException
   */
  private void runTest(boolean testWithIndex, int charsToOutput)
      throws IOException, NoSuchAlgorithmException, InterruptedException {

    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());

    Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

    FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir_, true);
    localFs.mkdirs(outputDir_);

    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir_);

    TaskAttemptContext attemptContext =
        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

    // create some input data
    byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

    if (testWithIndex) {
      Path lzoFile = new Path(outputDir_, lzoFileName_);
      LzoIndex.createIndex(localFs, lzoFile);
    }

    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir_);

    List<InputSplit> is = inputFormat.getSplits(job);
    // verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
      assertEquals(3, is.size());
    } else {
      assertEquals(1, is.size());
    }

    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : is) {
      RecordReader<LongWritable, Text> rr =
          inputFormat.createRecordReader(inputSplit, attemptContext);
      rr.initialize(inputSplit, attemptContext);

      while (rr.nextKeyValue()) {
        Text value = rr.getCurrentValue();

        md5_.update(value.getBytes(), 0, value.getLength());
      }

      rr.close();
    }

    localFs.close();
    assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
  }
Example #15
  public static void main(String[] args) throws IOException {
    String uri = args[0];

    Configuration conf = new Configuration();

    Path path = new Path(uri);

    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
      writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());

      for (int i = 0; i < 100; i++) {
        key.set(100 - i);
        value.set(DATA[i % DATA.length]);

        writer.append(key, value);
      }
    } finally {
      IOUtils.closeStream(writer);
    }
  }
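For completeness, a minimal read-back sketch for a file written as above, using the matching SequenceFile.Reader API; the argument handling mirrors the writer and is illustrative:

  public static void main(String[] args) throws IOException {
    String uri = args[0];
    Configuration conf = new Configuration();
    Path path = new Path(uri);
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Reader reader = null;
    try {
      reader = new SequenceFile.Reader(fs, path, conf);
      // iterate over all records and print key/value pairs
      while (reader.next(key, value)) {
        System.out.printf("%s\t%s%n", key, value);
      }
    } finally {
      IOUtils.closeStream(reader);
    }
  }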
Example #16
    @SuppressWarnings({"unchecked", "rawtypes"})
    @Override
    public void map(
        Writable key, Indexable doc, Mapper<Writable, Indexable, Text, Text>.Context context)
        throws IOException, InterruptedException {

      List<String> sentences = new ArrayList<String>();

      if (doc instanceof SentenceSegmentedDocument) {
        List<SentenceWritable> segmentedSentences =
            ((SentenceSegmentedDocument) doc).getSentences();
        for (SentenceWritable sentence : segmentedSentences) {
          sentences.add(sentence.toString());
        }
      } else {
        sentences =
            Arrays.asList(mSentenceDetector.sentDetect(doc.getContent().replace('\n', ' ')));
      }

      for (String sentence : sentences) {
        for (Pattern p : mPatterns) {
          mKey.set(p.pattern());
          if (p.matcher(sentence).find()) {
            mValue.set(sentence);
            context.write(mKey, mValue);
          }
        }
      }
    }
    @Override
    public void reduce(Text key, Iterable<HMapStIW> values, Context context)
        throws IOException, InterruptedException {
      Iterator<HMapStIW> iter = values.iterator();
      HMapStIW map = new HMapStIW();

      while (iter.hasNext()) {
        map.plus(iter.next());
      }

      HMapStFW writeMap = new HMapStFW();

      double pmi = 0.0;
      for (MapKI.Entry<String> entry : map.entrySet()) {
        String k = entry.getKey();

        if (map.get(k) >= 10) {
          if (wordCounts.containsKey(key.toString()) && wordCounts.containsKey(k)) {
            int px = wordCounts.get(key.toString());
            int py = wordCounts.get(k);
            pmi = Math.log10(((double) (map.get(k)) / (px * py)) * wordCounts.get("numLines*"));
            writeMap.put(k, (float) pmi);
          }
        }
      }
      if (writeMap.size() > 0) {
        context.write(key, writeMap);
      }
    }
Example #18
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {

    // System.out.println("in mapper, input "+ key + " " + value + ";");
    // userRow = null;
    userRow = value.toString().split("\\s");
    if (userRow.length == 1) {
      userRow = null;
      return;
    }
    // friendList = null;
    friendList = userRow[1].split(",");
    for (i = 0; i < friendList.length; i++) {
      keyUser.set(new Text(friendList[i]));
      for (j = 0; j < friendList.length; j++) {
        if (j == i) {
          continue;
        }
        suggTuple.set(friendList[j] + ",1");
        context.write(keyUser, suggTuple);
        // System.out.println(keyUser + ",(" + suggTuple + ")");
      }
      existingFriend.set(userRow[0] + ",-1");
      context.write(keyUser, existingFriend);
      // System.out.println(keyUser + ",(" + existingFriend + ")");

    }

    /*DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    Date date = new Date();
    System.out.println("Mapper done at: " + dateFormat.format(date)); //2014/08/06 15:59:48*/
  }
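A hedged companion reducer sketch for the friend-suggestion mapper above: for each user it tallies "candidate,1" votes and drops candidates marked "candidate,-1" (already friends). The class name and output format are illustrative assumptions (imports assumed: java.util.HashMap, java.util.HashSet, java.util.Map, java.util.Set):

  public static class SuggestReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      Map<String, Integer> counts = new HashMap<String, Integer>();
      Set<String> existing = new HashSet<String>();
      for (Text v : values) {
        String[] parts = v.toString().split(",");
        if ("-1".equals(parts[1])) {
          existing.add(parts[0]); // already a friend, never suggest
        } else {
          Integer c = counts.get(parts[0]);
          counts.put(parts[0], c == null ? 1 : c + 1);
        }
      }
      // emit "candidate(count)" entries for all non-friends
      StringBuilder sb = new StringBuilder();
      for (Map.Entry<String, Integer> e : counts.entrySet()) {
        if (existing.contains(e.getKey())) {
          continue;
        }
        if (sb.length() > 0) {
          sb.append(',');
        }
        sb.append(e.getKey()).append('(').append(e.getValue()).append(')');
      }
      context.write(key, new Text(sb.toString()));
    }
  }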
Example #19
    /**
     * {@inheritDoc}
     *
     * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object,
     *     org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
     */
    @Override
    public void map(
        LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
      String line = m_caseSensitive ? value.toString() : value.toString().toLowerCase();

      for (String pattern : m_patternsToSkip) {
        line = line.replaceAll(pattern, "");
      }

      StringTokenizer tokenizer = new StringTokenizer(line);
      while (tokenizer.hasMoreTokens()) {
        m_word.set(tokenizer.nextToken());
        output.collect(m_word, ONE);
        reporter.incrCounter(Counters.INPUT_WORDS, 1);
      }

      if ((++m_numRecords % 100) == 0) {
        reporter.setStatus(
            "Finished processing "
                + m_numRecords
                + " records "
                + "from the input file: "
                + m_inputFile);
      }
    }
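The old-API word-count mapper above pairs with a reducer that sums the emitted ones per word; a minimal sketch in the same org.apache.hadoop.mapred style (the class name is illustrative):

    // Companion reducer sketch: sums the IntWritable ones emitted by the mapper.
    public static class Reduce extends MapReduceBase
        implements Reducer<Text, IntWritable, Text, IntWritable> {

      @Override
      public void reduce(
          Text key,
          Iterator<IntWritable> values,
          OutputCollector<Text, IntWritable> output,
          Reporter reporter)
          throws IOException {
        int sum = 0;
        while (values.hasNext()) {
          sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
      }
    }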
Example #20
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {

      NodeWritable n = new NodeWritable(value.toString().trim());

      // Emit node to carry forward the Model.
      NodeWritable p = new NodeWritable(value.toString().trim());
      p.setIsNode(new Text("YES"));
      p.setIsInList(new Text("***"));
      context.write(new Text(p.getNid().toString()), p);

      // For Each OutLinks Emit This Node
      for (NodeWritable x : n.getOuts()) {
        if (!x.getNid().toString().equals(n.getNid().toString())) {
          n.setIsInList(new Text("YES"));
          n.setIsNode(new Text("NO"));
          context.write(new Text(x.getNid().toString()), n);
        }
      }

      // For Each Inlinks Emit This Node
      for (NodeWritable x : n.getIns()) {
        if (!x.getNid().toString().equals(n.getNid().toString())) {
          n.setIsInList(new Text("NO"));
          n.setIsNode(new Text("NO"));
          context.write(new Text(x.getNid().toString()), n);
        }
      }
    }
Example #21
    // specify input and out keys
    public void map(
        LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      String line = value.toString(); // define new variable to be string

      ArrayList<Integer> range = new ArrayList<Integer>();
      for (int i = 2000; i <= 2010; i++) {
        range.add(i);
      }

      // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
      String[] inputs = line.split(",");

      try {

        int year = Integer.parseInt(inputs[165]);

        if (range.contains(year)) {
          String dur = inputs[3];
          String artist_name = inputs[2];
          String song_title = inputs[1];
          String final_input = artist_name + ',' + dur + ',' + song_title;
          Final_Value.set(final_input);
          output.collect(Final_Value, dummy);
        }
      } catch (NumberFormatException e) {
        // do nothing
      }
    }
Example #22
  public final void readFields(DataInput in) throws IOException {
    metadata.clear();
    int sizeOrVersion = in.readInt();
    if (sizeOrVersion < 0) { // version
      version = sizeOrVersion;
      switch (version) {
        case VERSION:
          url = Text.readString(in);
          base = Text.readString(in);

          content = new byte[in.readInt()];
          in.readFully(content);

          contentType = Text.readString(in);
          metadata.readFields(in);
          break;
        default:
          throw new VersionMismatchException((byte) VERSION, (byte) version);
      }
    } else { // size
      byte[] compressed = new byte[sizeOrVersion];
      in.readFully(compressed, 0, compressed.length);
      ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
      DataInput inflater = new DataInputStream(new InflaterInputStream(deflated));
      readFieldsCompressed(inflater);
    }
  }
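The matching write() for the versioned branch of readFields above mirrors it field for field; a hedged sketch, assuming VERSION is a negative constant (as the readFields branch requires) and writing only the current, uncompressed layout:

  public final void write(DataOutput out) throws IOException {
    out.writeInt(VERSION); // a negative value marks the versioned (uncompressed) format
    Text.writeString(out, url);
    Text.writeString(out, base);
    out.writeInt(content.length);
    out.write(content);
    Text.writeString(out, contentType);
    metadata.write(out);
  }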
  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    if (key == null) {
      key = new Text();
    }
    if (value == null) {
      value = new Text();
    }
    Text edge = new Text();
    int newSize = 0;
    newSize =
        in.readLine(edge, Integer.MAX_VALUE, (int) Math.min((long) Integer.MAX_VALUE, end - pos));

    if (newSize == 0) {
      key = null;
      value = null;
      return false;
    } else {
      String[] dataArray = edge.toString().split("\t");
      if (dataArray.length < 2) {
        dataArray = edge.toString().split(" ");
      }
      key.set(dataArray[0]);
      value.set(dataArray[1]);
      pos += newSize;
      return true;
    }
  }
Example #24
    /** Run a FileOperation */
    public void map(
        Text key,
        PolicyInfo policy,
        OutputCollector<WritableComparable, Text> out,
        Reporter reporter)
        throws IOException {
      this.reporter = reporter;
      try {
        LOG.info("Raiding file=" + key.toString() + " policy=" + policy);
        Path p = new Path(key.toString());
        FileStatus fs = p.getFileSystem(jobconf).getFileStatus(p);
        st.clear();
        RaidNode.doRaid(jobconf, policy, fs, st, reporter);

        ++succeedcount;

        reporter.incrCounter(Counter.PROCESSED_BLOCKS, st.numProcessedBlocks);
        reporter.incrCounter(Counter.PROCESSED_SIZE, st.processedSize);
        reporter.incrCounter(Counter.META_BLOCKS, st.numMetaBlocks);
        reporter.incrCounter(Counter.META_SIZE, st.metaSize);

        reporter.incrCounter(Counter.FILES_SUCCEEDED, 1);
      } catch (IOException e) {
        ++failcount;
        reporter.incrCounter(Counter.FILES_FAILED, 1);

        String s = "FAIL: " + policy + ", " + key + " " + StringUtils.stringifyException(e);
        out.collect(null, new Text(s));
        LOG.info(s);
      } finally {
        reporter.setStatus(getCountString());
      }
    }
Example #25
    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {

      // Fetch words from value.
      String[] words = value.toString().split(Utils.delim);
      String decade = words[0],
          w1 = words[1],
          w2 = words[2],
          cW1 = words[3],
          cW2 = words[4],
          cW1W2 = words[5],
          cDecade = cDecades[Integer.parseInt(decade) - Utils.minDecade],
          pmi;

      if (cDecade.equals("-1")) {
        logger.severe("Unsupported decade: " + decade);
        return;
      }

      pmi =
          calculatePMI(
              Double.parseDouble(cW1),
              Double.parseDouble(cW2),
              Double.parseDouble(cW1W2),
              Double.parseDouble(cDecade));

      decadePmi.set(decade, pmi);
      newValue.set(w1 + Utils.delim + w2);
      context.write(decadePmi, newValue);
    }
 public boolean nextKeyValue() throws IOException {
   if (offset >= length) {
     return false;
   }
   int read = 0;
   while (read < RECORD_LENGTH) {
     long newRead = in.read(buffer, read, RECORD_LENGTH - read);
     if (newRead == -1) {
       if (read == 0) {
         return false;
       } else {
         throw new EOFException("read past eof");
       }
     }
     read += newRead;
   }
   if (key == null) {
     key = new Text();
   }
   if (value == null) {
     value = new Text();
   }
   key.set(buffer, 0, KEY_LENGTH);
   value.set(buffer, KEY_LENGTH, VALUE_LENGTH);
   offset += RECORD_LENGTH;
   return true;
 }
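The fixed-width constants assumed by the reader above follow the usual layout in which the record length is the key length plus the value length; the numbers below are illustrative (terasort-style 100-byte records), not taken from the original class:

 // Hypothetical field declarations assumed by nextKeyValue().
 private static final int KEY_LENGTH = 10;
 private static final int VALUE_LENGTH = 90;
 private static final int RECORD_LENGTH = KEY_LENGTH + VALUE_LENGTH;
 private final byte[] buffer = new byte[RECORD_LENGTH];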
Example #27
 @Override
 public void readFields(DataInput dataInput) throws IOException {
   leftCoordinate = dataInput.readLong();
   samRecord = Text.readString(dataInput);
   rfname = Text.readString(dataInput);
   // sqTag = Text.readString(dataInput);
 }
 protected boolean next(Text key, Text value) throws IOException {
   if (fsin.getPos() < end) {
     try {
       if (readUntilMatch(START_TITLE_MARKER, false)) {
         if (readUntilMatch(END_TITLE_MARKER, true)) {
           int stop = buffer.getLength() - END_TITLE_MARKER.length;
           key.set(buffer.getData(), 0, stop);
           buffer.reset();
           if (readUntilMatch(START_TEXT_MARKER, false)) {
             if (readUntilMatch(END_TEXT_MARKER, true)) {
               // un-escape the XML entities encoding and
               // re-encode the result as raw UTF8 bytes
               stop = buffer.getLength() - END_TITLE_MARKER.length;
               String xmlEscapedContent = new String(buffer.getData(), 0, stop + 1, UTF8);
               value.set(StringEscapeUtils.unescapeXml(xmlEscapedContent).getBytes(UTF8));
               return true;
             }
           }
         }
       }
     } finally {
       buffer.reset();
     }
   }
   return false;
 }
Example #29
 public boolean equals(Object bw) {
   if (bw instanceof BigramWritable) {
     BigramWritable ow = (BigramWritable) bw;
     return leftBigram.equals(ow.leftBigram) && rightBigram.equals(ow.rightBigram);
   }
   return false;
 }
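Because the class above overrides equals, it should also override hashCode; a minimal sketch consistent with the two Text fields (assuming they are never null):

 @Override
 public int hashCode() {
   // combine the two bigram hashes so equal objects hash equally
   return leftBigram.hashCode() * 31 + rightBigram.hashCode();
 }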
  /**
   * The result file contains a hierarchy of workerID-resultvar entries (incl. filename). We
   * deduplicate on the workerID. Without JVM reuse, each task refers to a unique workerID, so we
   * will not find any duplicates. With JVM reuse, however, each slot refers to a workerID, and
   * there are duplicate filenames due to partial aggregation and overwrite of fname (the
   * RemoteParWorkerMapper ensures uniqueness of those files independent of the runtime
   * implementation).
   *
   * @param job job configuration used to access the result file
   * @param fname path of the result file
   * @return array of deduplicated local variable maps, one per worker
   * @throws DMLRuntimeException
   */
  @SuppressWarnings("deprecation")
  public static LocalVariableMap[] readResultFile(JobConf job, String fname)
      throws DMLRuntimeException, IOException {
    HashMap<Long, LocalVariableMap> tmp = new HashMap<Long, LocalVariableMap>();

    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);
    LongWritable key = new LongWritable(); // workerID
    Text value = new Text(); // serialized var header (incl filename)

    int countAll = 0;
    for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
      SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(job), lpath, job);
      try {
        while (reader.next(key, value)) {
          // System.out.println("key="+key.get()+", value="+value.toString());
          if (!tmp.containsKey(key.get())) tmp.put(key.get(), new LocalVariableMap());
          Object[] dat = ProgramConverter.parseDataObject(value.toString());
          tmp.get(key.get()).put((String) dat[0], (Data) dat[1]);
          countAll++;
        }
      } finally {
        if (reader != null) reader.close();
      }
    }

    LOG.debug("Num remote worker results (before deduplication): " + countAll);
    LOG.debug("Num remote worker results: " + tmp.size());

    // create return array
    return tmp.values().toArray(new LocalVariableMap[0]);
  }