/**
 * Reads the next title/text record from the input.
 *
 * <p>Scans forward for a title element and then for a text element. The bytes between the
 * title markers are copied verbatim into {@code key}. The bytes between the text markers are
 * decoded with {@code UTF8}, XML-unescaped and re-encoded with {@code UTF8} into {@code value}.
 * The shared scratch buffer is always reset before returning.
 *
 * @param key receives the raw bytes found between the title markers
 * @param value receives the un-escaped bytes found between the text markers
 * @return {@code true} if both a title and a text element were read; {@code false} if the
 *     stream position is already at or past {@code end}, or any marker could not be matched
 * @throws IOException if reading from the underlying stream fails
 */
protected boolean next(Text key, Text value) throws IOException {
  if (fsin.getPos() >= end) {
    return false;
  }
  try {
    if (!readUntilMatch(START_TITLE_MARKER, false)) {
      return false;
    }
    if (!readUntilMatch(END_TITLE_MARKER, true)) {
      return false;
    }
    // The buffer ends with the closing title marker; drop it from the key.
    int titleLength = buffer.getLength() - END_TITLE_MARKER.length;
    key.set(buffer.getData(), 0, titleLength);
    buffer.reset();

    if (!readUntilMatch(START_TEXT_MARKER, false)) {
      return false;
    }
    if (!readUntilMatch(END_TEXT_MARKER, true)) {
      return false;
    }
    // The buffer ends with the closing *text* marker, so that is the length to strip.
    // The earlier code subtracted the title marker length and added one, which yields the
    // same slice only when the text marker is exactly one byte shorter than the title marker.
    int textLength = buffer.getLength() - END_TEXT_MARKER.length;

    // Un-escape the XML entity encoding and re-encode the result as raw UTF8 bytes.
    String xmlEscapedContent = new String(buffer.getData(), 0, textLength, UTF8);
    value.set(StringEscapeUtils.unescapeXml(xmlEscapedContent).getBytes(UTF8));
    return true;
  } finally {
    buffer.reset();
  }
}
示例#2
0
    /**
     * Reads the next record delimited by {@code startTag} and {@code endTag}.
     *
     * <p>On success {@code key} is set to the offset at which the start tag began and
     * {@code value} to the bytes from the start tag through the end tag inclusive.
     *
     * @param key receives the start offset of the record
     * @param value receives the record bytes, tags included
     * @return {@code true} if a complete record was read, {@code false} otherwise
     * @throws IOException if reading from the underlying stream fails
     * @throws RuntimeException if the stream is {@code Seekable} and its position disagrees
     *     with the internally tracked {@code pos}
     */
    public boolean next(LongWritable key, Text value) throws IOException {
      if (pos >= end || !readUntilMatch(startTag, false)) {
        return false;
      }
      recordStartPos = pos - startTag.length;

      try {
        buffer.write(startTag);
        if (!readUntilMatch(endTag, true)) {
          return false;
        }
        key.set(recordStartPos);
        value.set(buffer.getData(), 0, buffer.getLength());
        return true;
      } finally {
        // Input streams of gzipped files are not seekable (specifically, they do not
        // support getPos), so the number of bytes consumed is tracked here in pos.
        //
        // Sanity check that the internal byte count is accurate. It is meant to be
        // removed for efficiency once this code is confirmed to work correctly.
        if (fsin instanceof Seekable && pos != ((Seekable) fsin).getPos()) {
          throw new RuntimeException("bytes consumed error!");
        }

        buffer.reset();
      }
    }
示例#3
0
 /**
  * Serializes a {@code Writable} into a new byte array.
  *
  * @param writable the object to serialize
  * @return a copy of exactly the bytes written by {@code writable}
  * @throws AssertionError if writing to the in-memory buffer raises an {@code IOException}
  */
 static byte[] write(Writable writable) {
   final DataOutputBuffer out = new DataOutputBuffer();
   out.reset();
   try {
     writable.write(out);
   } catch (IOException ioe) {
     throw new AssertionError(ioe);
   }
   // Copy only the bytes actually written, not the whole array returned by getData().
   final byte[] backing = out.getData();
   return Arrays.copyOf(backing, out.getLength());
 }
 /**
  * Read raw bytes from a SequenceFile.
  *
  * <p>Records the reader position in {@code key} and {@code info}, reads the next raw key and
  * (unless at end of file) the raw value, and stores the key length, value size, number of
  * bytes consumed and the sync-seen flag in {@code value}.
  *
  * @return {@code false} once the reader is done, i.e. at end of file or when the record
  *     started at or past {@code end} and a sync marker has been seen; {@code true} otherwise
  */
 public synchronized boolean nextKeyValue() throws IOException, InterruptedException {
   if (done) {
     return false;
   }

   final long recordStart = in.getPosition();
   key.set(recordStart);
   info.setPosition(recordStart);

   final boolean atEof = in.nextRawKey(buffer) == -1;
   if (!atEof) {
     in.nextRawValue(vbytes);
     value.set(
         buffer.getLength(),
         vbytes.getSize(),
         (int) (in.getPosition() - recordStart),
         in.syncSeen());
   }
   buffer.reset();

   done = atEof || (recordStart >= end && in.syncSeen());
   return !done;
 }
示例#5
0
 /**
  * Writes every {@code ReplicaState} value to a buffer, reads it back, and checks that the
  * value read is the same instance as the one written.
  */
 @Test
 public void testReadWriteReplicaState() {
   try {
     DataOutputBuffer sink = new DataOutputBuffer();
     DataInputBuffer source = new DataInputBuffer();
     for (HdfsServerConstants.ReplicaState expected :
         HdfsServerConstants.ReplicaState.values()) {
       expected.write(sink);
       source.reset(sink.getData(), sink.getLength());
       HdfsServerConstants.ReplicaState actual = HdfsServerConstants.ReplicaState.read(source);
       assertTrue("testReadWrite error !!!", expected == actual);
       // Clear both buffers so the next state starts from an empty stream.
       sink.reset();
       source.reset();
     }
   } catch (Exception ex) {
     fail("testReadWrite ex error ReplicaState");
   }
 }
示例#6
0
 /**
  * Returns the next byte of the text rendering of the underlying records.
  *
  * <p>Each record is rendered as {@code key.toString()}, a tab, {@code val.toString()} and a
  * newline. When the current rendering is exhausted the next record is fetched from
  * {@code r} and rendered into {@code outbuf}, which then backs {@code inbuf}.
  *
  * <p>Strings are encoded as UTF-8 explicitly. The no-argument {@code String.getBytes()} uses
  * the platform default charset, which makes the emitted bytes depend on the JVM's locale.
  *
  * @return the next byte, or {@code -1} when there are no more records
  * @throws IOException if reading the next record fails
  */
 public int read() throws IOException {
   int ret;
   // NOTE(review): if inbuf were null here, inbuf.reset(...) below would throw; presumably
   // inbuf is always initialised by the enclosing class. Confirm.
   if (null == inbuf || -1 == (ret = inbuf.read())) {
     if (!r.next(key, val)) {
       return -1;
     }
     byte[] tmp = key.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
     outbuf.write(tmp, 0, tmp.length);
     outbuf.write('\t');
     tmp = val.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
     outbuf.write(tmp, 0, tmp.length);
     outbuf.write('\n');
     // Hand the rendered record to the input buffer, then clear the output buffer for reuse.
     inbuf.reset(outbuf.getData(), outbuf.getLength());
     outbuf.reset();
     ret = inbuf.read();
   }
   return ret;
 }
示例#7
0
 /**
  * Reads the next record delimited by {@code startTag} and {@code endTag}.
  *
  * <p>On success {@code key} is set to the stream position after the end tag has been read,
  * and {@code value} to the bytes from the start tag through the end tag inclusive. The
  * scratch buffer is reset once the start tag has been matched, whatever the outcome.
  *
  * @return {@code true} if a complete record was read, {@code false} otherwise
  */
 @Override
 public boolean nextKeyValue() throws IOException, InterruptedException {
   if (fsin.getPos() >= end || !readUntilMatch(startTag, false)) {
     return false;
   }
   try {
     buffer.write(startTag);
     if (!readUntilMatch(endTag, true)) {
       return false;
     }
     key.set(fsin.getPos());
     value.set(buffer.getData(), 0, buffer.getLength());
     return true;
   } finally {
     buffer.reset();
   }
 }
  @Override
  // Reads the page header one byte at a time, filling `meta` with the page title,
  // namespace and page id, and stops once the first <revision> tag has been passed.
  //
  // Progress is tracked in the local `flag`, which takes these values:
  // -1: EOF
  // 1 - outside the <page> tag
  // 2 - just passed the <page> tag but outside the <title>
  // 3 - just passed the <title> tag
  // 4 - just passed the </title> tag but outside the <namespace>
  // 5 - just passed the <namespace>
  // 6 - just passed the </namespace> but outside the <id>
  // 7 - just passed the (page's) <id>
  // 8 - just passed the </id> tag but outside the <revision>
  // 9 - (optionally) just passed the <redirect>
  // 10 - just passed the (next) <revision>
  // This method starts in state 2; it never assigns -1 or 1 to `flag` itself.
  //
  // Returns:
  //   Ack.EOF                - fetchMore() reported no more data
  //   Ack.SKIPPED            - the namespace is non-zero and skipNonArticles is set, or a
  //                            redirect tag was matched and skipRedirect is set; `meta` is
  //                            cleared first
  //   Ack.PASSED_TO_NEXT_TAG - START_REVISION was fully matched
  //   Ack.FAILED             - only from the `skipped` branch (see the note in the loop)
  //
  // NOTE(review): tag matching is naive. On a mismatch the counter `i` restarts at 0
  // without re-testing the current byte against the first byte of the tag.
  protected Ack readToPageHeader(RevisionHeader meta) throws IOException {
    // Number of bytes of the tag currently being looked for that have matched so far.
    int i = 0;
    // Current state; see the table above.
    int flag = 2;
    boolean skipped = false;
    // Used in state 8 only: which tag the partial match is consistent with.
    // -1 = nothing matched yet, 1 = START_REVISION, 2 = START_REDIRECT, 3 = both.
    int revOrRedirect = -1;
    // Scratch buffers for the title, namespace and id text; try-with-resources closes
    // them on every exit path.
    try (DataOutputBuffer pageTitle = new DataOutputBuffer();
        DataOutputBuffer nsBuf = new DataOutputBuffer();
        DataOutputBuffer keyBuf = new DataOutputBuffer()) {

      while (true) {
        if (!fetchMore()) return Ack.EOF;
        while (hasData()) {
          byte b = nextByte();

          // when passing the namespace and we realize that
          // this is not an article, and that the option of skipping
          // non-article pages is on, we simply skip everything until
          // the closing </page>
          //
          // NOTE(review): `skipped` is a local that is only set to true immediately
          // before a `return`, so this branch looks unreachable within a single call.
          if (skipped) {
            if (flag >= 6) {
              Log.warn("Peculiar read after skipping namespace");
              /*
              if (b == END_PAGE[i]) {
              	i++;
              } else i = 0;
              if (i >= END_PAGE.length) {
              	return Ack.SKIPPED;
              } */
              return Ack.FAILED;
            } else return Ack.SKIPPED;
          }

          // State 2: look for the opening title tag.
          if (flag == 2) {
            if (b == START_TITLE[i]) {
              i++;
            } else i = 0;
            if (i >= START_TITLE.length) {
              flag = 3;
              i = 0;
            }
          }

          // put everything between <title></title> block into title
          else if (flag == 3) {
            if (b == END_TITLE[i]) {
              i++;
            } else i = 0;
            pageTitle.write(b);
            if (i >= END_TITLE.length) {
              flag = 4;
              // The buffer ends with the closing tag itself; exclude it from the title.
              // NOTE(review): no charset is given, so the platform default is used.
              String title =
                  new String(pageTitle.getData(), 0, pageTitle.getLength() - END_TITLE.length);
              meta.setPageTitle(title);
              pageTitle.reset();
              i = 0;
            }
          } else if (flag == 4) {
            // State 4: look for the opening namespace tag.
            if (b == START_NAMESPACE[i]) {
              i++;
            } else i = 0;
            if (i >= START_NAMESPACE.length) {
              flag = 5;
              i = 0;
            }
          } else if (flag == 5) {
            // State 5: collect bytes until the closing namespace tag has been matched.
            if (b == END_NAMESPACE[i]) {
              i++;
            } else i = 0;
            nsBuf.write(b);
            if (i >= END_NAMESPACE.length) {
              flag = 6;
              String nsStr =
                  new String(nsBuf.getData(), 0, nsBuf.getLength() - END_NAMESPACE.length);
              // Integer.parseInt throws NumberFormatException if nsStr is not a plain integer.
              int ns = Integer.parseInt(nsStr);
              nsBuf.reset();
              // A non-zero namespace is skipped when skipNonArticles is set.
              if (ns != 0) {
                if (skipNonArticles) {
                  skipped = true;
                  meta.clear();
                  return Ack.SKIPPED;
                }
              }
              meta.setNamespace(ns);
              i = 0;
            }
          } else if (flag == 6) {
            // State 6: look for the opening id tag.
            if (b == START_ID[i]) {
              i++;
            } else i = 0;
            if (i >= START_ID.length) {
              flag = 7;
              i = 0;
            }
          }

          // put everything in outer <id></id> block into keyBuf
          else if (flag == 7) {
            if (b == END_ID[i]) {
              i++;
            } else i = 0;
            keyBuf.write(b);
            if (i >= END_ID.length) {
              flag = 8;
              String idStr = new String(keyBuf.getData(), 0, keyBuf.getLength() - END_ID.length);
              long pageId = Long.parseLong(idStr);
              meta.setPageId(pageId);
              // Unlike pageTitle and nsBuf, keyBuf is not reset here; state 7 is not
              // entered again during this call.
              i = 0;
            }
          } else if (flag == 8) {
            // State 8: match START_REVISION and START_REDIRECT at the same time.
            // curMatch says which of the two the current byte extends at offset i:
            // 0 = neither, 1 = revision only, 2 = redirect only, 3 = both.
            int curMatch = 0;
            if ((i < START_REVISION.length && b == START_REVISION[i])
                && (i < START_REDIRECT.length && b == START_REDIRECT[i])

                // subtle bug here: some tag names can overlap
                // multiple times
                && (revOrRedirect == 3 || revOrRedirect == -1)) {
              curMatch = 3;
            } else if (i < START_REVISION.length && b == START_REVISION[i] && revOrRedirect != 2) {
              curMatch = 1;
            } else if (i < START_REDIRECT.length && b == START_REDIRECT[i] && revOrRedirect != 1) {
              curMatch = 2;
            } else {
              curMatch = 0;
            }
            // Advance only if the byte matched and is consistent with the match so far.
            // NOTE(review): when the counter is reset, revOrRedirect keeps its old value.
            // After a failed partial match that had narrowed to 1 or 2, the other tag can
            // no longer be selected by the branches above. Confirm whether this is intended.
            if (curMatch > 0 && (i == 0 || revOrRedirect == 3 || curMatch == revOrRedirect)) {
              i++;
              revOrRedirect = curMatch;
            } else i = 0;
            if ((revOrRedirect == 2 || revOrRedirect == 3) && i >= START_REDIRECT.length) {
              // Redirect tag fully matched: skip the page if configured to, otherwise
              // go on to look for the revision tag in state 9.
              if (skipRedirect) {
                skipped = true;
                meta.clear();
                return Ack.SKIPPED;
              }
              revOrRedirect = -1;
              flag = 9;
              i = 0;
            } else if ((revOrRedirect == 1 || revOrRedirect == 3) && i >= START_REVISION.length) {
              // Revision tag fully matched: the header is complete.
              flag = 10;
              revOrRedirect = -1;
              return Ack.PASSED_TO_NEXT_TAG;
            }
          } else if (flag == 9 && !skipRedirect) {
            // State 9: a redirect tag was passed; look for the revision tag after it.
            if (b == START_REVISION[i]) {
              i++;
            } else i = 0;
            if (i >= START_REVISION.length) {
              flag = 10;
              return Ack.PASSED_TO_NEXT_TAG;
            }
          }
        }
      }
    }
  }
示例#9
0
  /**
   * Creates a random number (at least two) of in-memory segments.
   *
   * <p>Each segment is filled with the entries returned by {@code createData()}, serialized
   * with the serializers for {@code keyClass} and {@code valClass}. Every entry written is
   * also recorded in {@code originalData}.
   *
   * @return the segments created, one per stream
   * @throws IOException if serializing or writing an entry fails
   */
  public List<TezMerger.Segment> createInMemStreams() throws IOException {
    final int streamCount = Math.max(2, rnd.nextInt(10));
    LOG.info("No of streams : " + streamCount);

    SerializationFactory factory = new SerializationFactory(conf);
    Serializer keySerializer = factory.getSerializer(keyClass);
    Serializer valueSerializer = factory.getSerializer(valClass);

    LocalDirAllocator dirAllocator = new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
    InputContext inputContext = createTezInputContext();
    MergeManager manager =
        new MergeManager(
            conf,
            fs,
            dirAllocator,
            inputContext,
            null,
            null,
            null,
            null,
            null,
            1024 * 1024 * 10,
            null,
            false,
            -1);

    // Serializers write into the output buffers; the input buffers re-expose those bytes
    // so they can be handed to the writer.
    DataOutputBuffer keyOut = new DataOutputBuffer();
    DataOutputBuffer valueOut = new DataOutputBuffer();
    DataInputBuffer keyView = new DataInputBuffer();
    DataInputBuffer valueView = new DataInputBuffer();
    keySerializer.open(keyOut);
    valueSerializer.open(valueOut);

    List<TezMerger.Segment> result = new LinkedList<TezMerger.Segment>();
    for (int stream = 0; stream < streamCount; stream++) {
      BoundedByteArrayOutputStream segmentBytes = new BoundedByteArrayOutputStream(1024 * 1024);
      InMemoryWriter segmentWriter = new InMemoryWriter(segmentBytes);
      Map<Writable, Writable> records = createData();

      // Serialize each entry, append it to the segment and remember it for later checks.
      for (Map.Entry<Writable, Writable> record : records.entrySet()) {
        Writable recordKey = record.getKey();
        Writable recordValue = record.getValue();
        keySerializer.serialize(recordKey);
        valueSerializer.serialize(recordValue);
        keyView.reset(keyOut.getData(), 0, keyOut.getLength());
        valueView.reset(valueOut.getData(), 0, valueOut.getLength());
        segmentWriter.append(keyView, valueView);
        originalData.put(recordKey, recordValue);
        keyOut.reset();
        valueOut.reset();
        keyView.reset();
        valueView.reset();
      }

      // The reader is built over the stream's backing array before the writer is closed.
      byte[] backing = segmentBytes.getBuffer();
      IFile.Reader segmentReader = new InMemoryReader(manager, null, backing, 0, backing.length);
      result.add(new TezMerger.Segment(segmentReader, true));

      records.clear();
      segmentWriter.close();
    }
    return result;
  }
  /**
   * Round-trips random records through {@code SequenceFileAsBinaryOutputFormat}.
   *
   * <p>Writes {@code RECORDS} pairs of serialized {@code IntWritable}/{@code DoubleWritable}
   * as raw bytes with block compression, reads them back with {@code SequenceFileInputFormat},
   * and checks each pair against the same pseudo-random sequence re-seeded with the original
   * seed.
   */
  public void testBinary() throws IOException, InterruptedException {
    Configuration configuration = new Configuration();
    Job job = new Job(configuration);

    Path outputDir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
    // Draw a seed and apply it, so the same sequence can be replayed when reading.
    Random random = new Random();
    final long seed = random.nextLong();
    random.setSeed(seed);

    FileOutputFormat.setOutputPath(job, outputDir);

    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

    SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    BytesWritable rawKey = new BytesWritable();
    BytesWritable rawValue = new BytesWritable();

    TaskAttemptContext taskContext =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    OutputFormat<BytesWritable, BytesWritable> binaryFormat =
        new SequenceFileAsBinaryOutputFormat();
    OutputCommitter outputCommitter = binaryFormat.getOutputCommitter(taskContext);
    outputCommitter.setupJob(job);
    RecordWriter<BytesWritable, BytesWritable> recordWriter =
        binaryFormat.getRecordWriter(taskContext);

    DataOutputBuffer scratch = new DataOutputBuffer();
    LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
    try {
      for (int written = 0; written < RECORDS; ++written) {
        // Serialize the key, copy its bytes out, then reuse the scratch buffer for the value.
        IntWritable intKey = new IntWritable(random.nextInt());
        intKey.write(scratch);
        rawKey.set(scratch.getData(), 0, scratch.getLength());
        scratch.reset();

        DoubleWritable doubleValue = new DoubleWritable(random.nextDouble());
        doubleValue.write(scratch);
        rawValue.set(scratch.getData(), 0, scratch.getLength());
        scratch.reset();

        recordWriter.write(rawKey, rawValue);
      }
    } finally {
      recordWriter.close(taskContext);
    }
    outputCommitter.commitTask(taskContext);
    outputCommitter.commitJob(job);

    InputFormat<IntWritable, DoubleWritable> inputFormat =
        new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int recordsRead = 0;
    // Replay the sequence that was used for writing.
    random.setSeed(seed);
    SequenceFileInputFormat.setInputPaths(job, outputDir);
    LOG.info("Reading data by SequenceFileInputFormat");
    for (InputSplit split : inputFormat.getSplits(job)) {
      RecordReader<IntWritable, DoubleWritable> recordReader =
          inputFormat.createRecordReader(split, taskContext);
      MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mapContext =
          new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
              job.getConfiguration(),
              taskContext.getTaskAttemptID(),
              recordReader,
              null,
              null,
              MapReduceTestUtil.createDummyReporter(),
              split);
      recordReader.initialize(split, mapContext);
      try {
        while (recordReader.nextKeyValue()) {
          final int expectedInt = random.nextInt();
          final double expectedDouble = random.nextDouble();
          IntWritable actualKey = recordReader.getCurrentKey();
          DoubleWritable actualValue = recordReader.getCurrentValue();
          assertEquals(
              "Keys don't match: " + "*" + actualKey.get() + ":" + expectedInt + "*",
              expectedInt,
              actualKey.get());
          assertTrue(
              "Vals don't match: " + "*" + actualValue.get() + ":" + expectedDouble + "*",
              Double.compare(actualValue.get(), expectedDouble) == 0);
          ++recordsRead;
        }
      } finally {
        recordReader.close();
      }
    }
    assertEquals("Some records not found", RECORDS, recordsRead);
  }