/**
 * Advances to the next wiki page record, setting {@code key} to the page title
 * and {@code value} to the XML-unescaped page text re-encoded as raw UTF-8 bytes.
 *
 * @param key   receives the bytes between the title markers
 * @param value receives the unescaped text content
 * @return true if a full title/text pair was read before the split end, false otherwise
 * @throws IOException on stream errors
 */
protected boolean next(Text key, Text value) throws IOException {
  // Only start a new record while still inside this reader's split.
  if (fsin.getPos() < end) {
    try {
      if (readUntilMatch(START_TITLE_MARKER, false)) {
        if (readUntilMatch(END_TITLE_MARKER, true)) {
          // buffer now ends with the closing title marker; trim it off for the key.
          int stop = buffer.getLength() - END_TITLE_MARKER.length;
          key.set(buffer.getData(), 0, stop);
          buffer.reset();
          if (readUntilMatch(START_TEXT_MARKER, false)) {
            if (readUntilMatch(END_TEXT_MARKER, true)) {
              // un-escape the XML entities encoding and
              // re-encode the result as raw UTF8 bytes
              // NOTE(review): this subtracts END_TITLE_MARKER.length but then reads
              // stop + 1 bytes below. That only trims the text end-marker correctly
              // if END_TITLE_MARKER is exactly one byte longer than END_TEXT_MARKER
              // (e.g. "</title>" vs "</text>"). Looks like it was meant to be
              // END_TEXT_MARKER.length with stop bytes — TODO confirm marker lengths.
              stop = buffer.getLength() - END_TITLE_MARKER.length;
              String xmlEscapedContent = new String(buffer.getData(), 0, stop + 1, UTF8);
              value.set(StringEscapeUtils.unescapeXml(xmlEscapedContent).getBytes(UTF8));
              return true;
            }
          }
        }
      }
    } finally {
      // Always clear the scratch buffer, whether or not a record was produced.
      buffer.reset();
    }
  }
  return false;
}
public boolean next(LongWritable key, Text value) throws IOException { if (pos < end) { if (readUntilMatch(startTag, false)) { recordStartPos = pos - startTag.length; try { buffer.write(startTag); if (readUntilMatch(endTag, true)) { key.set(recordStartPos); value.set(buffer.getData(), 0, buffer.getLength()); return true; } } finally { // Because input streams of gzipped files are not // seekable (specifically, do not support getPos), we // need to keep track of bytes consumed ourselves. // This is a sanity check to make sure our internal // computation of bytes consumed is accurate. This // should be removed later for efficiency once we // confirm that this code works correctly. if (fsin instanceof Seekable) { if (pos != ((Seekable) fsin).getPos()) { throw new RuntimeException("bytes consumed error!"); } } buffer.reset(); } } } return false; }
/**
 * Serializes a {@link Writable} to a freshly allocated byte array.
 *
 * @param writable the object to serialize; must not be null
 * @return a byte array sized exactly to the serialized length
 * @throws AssertionError if the in-memory write unexpectedly throws
 *     {@link IOException} (a memory-backed buffer should never do so)
 */
static byte[] write(Writable writable) {
  // A new DataOutputBuffer starts empty; the previous explicit reset() was redundant.
  DataOutputBuffer buffer = new DataOutputBuffer();
  try {
    writable.write(buffer);
  } catch (IOException e) {
    // Writing to an in-memory buffer cannot legitimately fail; treat as a bug.
    throw new AssertionError(e);
  }
  // Copy out only the valid prefix — getData() may be over-allocated.
  return Arrays.copyOf(buffer.getData(), buffer.getLength());
}
/**
 * Read raw (undeserialized) bytes from a SequenceFile.
 *
 * Sets {@code key} to the byte position of the record and {@code value} to the
 * raw value bytes plus record metadata. Sets and returns the {@code done} flag
 * when EOF is reached or a sync point is seen past the split end.
 *
 * @return true if a record was read, false once this reader is done
 */
public synchronized boolean nextKeyValue() throws IOException, InterruptedException {
  if (done) {
    return false;
  }
  // Record the position before the read; it becomes the key.
  long pos = in.getPosition();
  key.set(pos);
  info.setPosition(pos);
  boolean eof = -1 == in.nextRawKey(buffer);
  if (!eof) {
    in.nextRawValue(vbytes);
    value.set(
        buffer.getLength(),               // raw key length
        vbytes.getSize(),                 // raw value length
        (int) (in.getPosition() - pos),   // total bytes consumed by this record
        in.syncSeen());
  }
  // Clear the key buffer for the next call regardless of outcome.
  buffer.reset();
  // Done on EOF, or once a sync marker is seen at/after the split boundary.
  return !(done = (eof || (pos >= end && in.syncSeen())));
}
/**
 * Round-trips every {@link HdfsServerConstants.ReplicaState} through
 * write/read on in-memory buffers and checks the value survives intact.
 */
@Test
public void testReadWriteReplicaState() {
  try {
    DataOutputBuffer out = new DataOutputBuffer();
    DataInputBuffer in = new DataInputBuffer();
    for (HdfsServerConstants.ReplicaState repState : HdfsServerConstants.ReplicaState.values()) {
      repState.write(out);
      in.reset(out.getData(), out.getLength());
      HdfsServerConstants.ReplicaState result = HdfsServerConstants.ReplicaState.read(in);
      // Enum round-trip must yield the same constant, so identity compare is valid.
      assertTrue("testReadWrite error !!!", repState == result);
      // Reset both buffers so each state is serialized in isolation.
      out.reset();
      in.reset();
    }
  } catch (Exception ex) {
    // Include the exception so a failure is diagnosable (previously swallowed).
    fail("testReadWrite ex error ReplicaState: " + ex);
  }
}
/**
 * Presents the underlying record reader as a byte stream: each record is
 * rendered as {@code key\tvalue\n} and served one byte per call.
 *
 * @return the next byte, or -1 when the record reader is exhausted
 * @throws IOException on read errors from the underlying reader
 */
public int read() throws IOException {
  int ret;
  // Refill the line buffer when it is absent or fully consumed.
  if (null == inbuf || -1 == (ret = inbuf.read())) {
    if (!r.next(key, val)) {
      return -1;
    }
    // Text holds UTF-8; encode explicitly rather than with the platform
    // default charset (the previous getBytes() could corrupt non-ASCII data
    // on non-UTF-8 platforms). UnsupportedEncodingException is an
    // IOException, so the method signature is unchanged.
    byte[] tmp = key.toString().getBytes("UTF-8");
    outbuf.write(tmp, 0, tmp.length);
    outbuf.write('\t');
    tmp = val.toString().getBytes("UTF-8");
    outbuf.write(tmp, 0, tmp.length);
    outbuf.write('\n');
    // Hand the assembled line to the input-side buffer and recycle the output buffer.
    inbuf.reset(outbuf.getData(), outbuf.getLength());
    outbuf.reset();
    ret = inbuf.read();
  }
  return ret;
}
/**
 * Reads the next XML record delimited by {@code startTag}/{@code endTag}.
 * The key is the stream position after the record was consumed; the value is
 * the record text including both tags.
 *
 * @return true if a complete record was read, false at end of split/stream
 */
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
  // Past the split boundary: this reader produces no further records.
  if (fsin.getPos() >= end) {
    return false;
  }
  // No start tag remaining in the stream: done.
  if (!readUntilMatch(startTag, false)) {
    return false;
  }
  try {
    buffer.write(startTag);
    if (!readUntilMatch(endTag, true)) {
      return false;
    }
    key.set(fsin.getPos());
    value.set(buffer.getData(), 0, buffer.getLength());
    return true;
  } finally {
    // Clear scratch space whether or not a record was emitted.
    buffer.reset();
  }
}
/**
 * Read the page header, advancing a byte-at-a-time state machine through the
 * wiki XML and filling {@code meta} with title, namespace and page id.
 *
 * State values held in {@code flag}:
 *   -1: EOF
 *    1 - outside the &lt;page&gt; tag
 *    2 - just passed the &lt;page&gt; tag but outside the &lt;title&gt;
 *    3 - just passed the &lt;title&gt; tag
 *    4 - just passed the &lt;/title&gt; tag but outside the &lt;namespace&gt;
 *    5 - just passed the &lt;namespace&gt;
 *    6 - just passed the &lt;/namespace&gt; but outside the &lt;id&gt;
 *    7 - just passed the (page's) &lt;id&gt;
 *    8 - just passed the &lt;/id&gt; tag but outside the &lt;revision&gt;
 *    9 - (optionally) just passed the &lt;redirect&gt;
 *   10 - just passed the (next) &lt;revision&gt;
 *
 * @param meta receives the parsed page header fields
 * @return EOF, SKIPPED, FAILED, or PASSED_TO_NEXT_TAG once &lt;revision&gt; is reached
 */
@Override
protected Ack readToPageHeader(RevisionHeader meta) throws IOException {
  int i = 0;                 // how many bytes of the current marker have matched
  int flag = 2;              // state machine state (see javadoc above)
  boolean skipped = false;   // set once this page has been rejected
  int revOrRedirect = -1;    // disambiguates <revision> vs <redirect> matching in state 8
  try (DataOutputBuffer pageTitle = new DataOutputBuffer();
      DataOutputBuffer nsBuf = new DataOutputBuffer();
      DataOutputBuffer keyBuf = new DataOutputBuffer()) {
    while (true) {
      if (!fetchMore()) return Ack.EOF;
      while (hasData()) {
        byte b = nextByte();
        // when passing the namespace and we realize that
        // this is not an article, and that the option of skipping
        // non-article pages is on, we simply skip everything until
        // the closing </page>
        if (skipped) {
          if (flag >= 6) {
            // Should not normally happen: skipping was decided at the
            // namespace stage, before states >= 6 are reachable.
            Log.warn("Peculiar read after skipping namespace");
            /*
            if (b == END_PAGE[i]) {
              i++;
            } else i = 0;
            if (i >= END_PAGE.length) {
              return Ack.SKIPPED;
            }
            */
            return Ack.FAILED;
          } else return Ack.SKIPPED;
        }
        // State 2: scan for <title>.
        if (flag == 2) {
          if (b == START_TITLE[i]) {
            i++;
          } else i = 0;
          if (i >= START_TITLE.length) {
            flag = 3;
            i = 0;
          }
        }
        // put everything between <title></title> block into title
        else if (flag == 3) {
          if (b == END_TITLE[i]) {
            i++;
          } else i = 0;
          pageTitle.write(b);
          if (i >= END_TITLE.length) {
            flag = 4;
            // Trim the trailing </title> bytes that were buffered while matching.
            String title = new String(pageTitle.getData(), 0, pageTitle.getLength() - END_TITLE.length);
            meta.setPageTitle(title);
            pageTitle.reset();
            i = 0;
          }
        }
        // State 4: scan for <namespace>.
        else if (flag == 4) {
          if (b == START_NAMESPACE[i]) {
            i++;
          } else i = 0;
          if (i >= START_NAMESPACE.length) {
            flag = 5;
            i = 0;
          }
        }
        // State 5: accumulate namespace text until </namespace>.
        else if (flag == 5) {
          if (b == END_NAMESPACE[i]) {
            i++;
          } else i = 0;
          nsBuf.write(b);
          if (i >= END_NAMESPACE.length) {
            flag = 6;
            String nsStr = new String(nsBuf.getData(), 0, nsBuf.getLength() - END_NAMESPACE.length);
            int ns = Integer.parseInt(nsStr);
            nsBuf.reset();
            // Namespace 0 is the article namespace; others may be skipped.
            if (ns != 0) {
              if (skipNonArticles) {
                skipped = true;
                meta.clear();
                return Ack.SKIPPED;
              }
            }
            meta.setNamespace(ns);
            i = 0;
          }
        }
        // State 6: scan for the page's <id>.
        else if (flag == 6) {
          if (b == START_ID[i]) {
            i++;
          } else i = 0;
          if (i >= START_ID.length) {
            flag = 7;
            i = 0;
          }
        }
        // put everything in outer <id></id> block into keyBuf
        else if (flag == 7) {
          if (b == END_ID[i]) {
            i++;
          } else i = 0;
          keyBuf.write(b);
          if (i >= END_ID.length) {
            flag = 8;
            String idStr = new String(keyBuf.getData(), 0, keyBuf.getLength() - END_ID.length);
            long pageId = Long.parseLong(idStr);
            meta.setPageId(pageId);
            i = 0;
          }
        }
        // State 8: the next tag is either <revision> or <redirect>; match
        // both prefixes simultaneously (they share leading bytes).
        else if (flag == 8) {
          int curMatch = 0;
          if ((i < START_REVISION.length && b == START_REVISION[i])
              && (i < START_REDIRECT.length && b == START_REDIRECT[i])
              // subtle bug here: some tag names can overlap
              // multiple times
              && (revOrRedirect == 3 || revOrRedirect == -1)) {
            curMatch = 3;  // still ambiguous: byte matches both tags
          } else if (i < START_REVISION.length && b == START_REVISION[i]
              && revOrRedirect != 2) {
            curMatch = 1;  // committed to <revision>
          } else if (i < START_REDIRECT.length && b == START_REDIRECT[i]
              && revOrRedirect != 1) {
            curMatch = 2;  // committed to <redirect>
          } else {
            curMatch = 0;  // mismatch: restart matching
          }
          if (curMatch > 0
              && (i == 0 || revOrRedirect == 3 || curMatch == revOrRedirect)) {
            i++;
            revOrRedirect = curMatch;
          } else i = 0;
          if ((revOrRedirect == 2 || revOrRedirect == 3) && i >= START_REDIRECT.length) {
            // Fully matched <redirect>.
            if (skipRedirect) {
              skipped = true;
              meta.clear();
              return Ack.SKIPPED;
            }
            revOrRedirect = -1;
            flag = 9;
            i = 0;
          } else if ((revOrRedirect == 1 || revOrRedirect == 3) && i >= START_REVISION.length) {
            // Fully matched <revision>: header complete.
            flag = 10;
            revOrRedirect = -1;
            return Ack.PASSED_TO_NEXT_TAG;
          }
        }
        // State 9: a <redirect> was seen and kept; now find the <revision>.
        else if (flag == 9 && !skipRedirect) {
          if (b == START_REVISION[i]) {
            i++;
          } else i = 0;
          if (i >= START_REVISION.length) {
            flag = 10;
            return Ack.PASSED_TO_NEXT_TAG;
          }
        }
      }
    }
  }
}
/**
 * Creates a random number (2..10) of in-memory IFile segments populated with
 * randomly generated key/value data, recording every pair written into
 * {@code originalData} so tests can verify merge output.
 *
 * @return the list of in-memory segments backed by {@link InMemoryReader}s
 * @throws IOException if serialization or the in-memory writer fails
 */
public List<TezMerger.Segment> createInMemStreams() throws IOException {
  int numberOfStreams = Math.max(2, rnd.nextInt(10));
  LOG.info("No of streams : " + numberOfStreams);

  SerializationFactory serializationFactory = new SerializationFactory(conf);
  Serializer keySerializer = serializationFactory.getSerializer(keyClass);
  Serializer valueSerializer = serializationFactory.getSerializer(valClass);

  LocalDirAllocator localDirAllocator =
      new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
  InputContext context = createTezInputContext();
  MergeManager mergeManager =
      new MergeManager(
          conf,
          fs,
          localDirAllocator,
          context,
          null,
          null,
          null,
          null,
          null,
          1024 * 1024 * 10,  // 10 MB memory limit for the merge manager
          null,
          false,
          -1);

  // Shared scratch buffers: serialize into the output buffers, then expose the
  // bytes to the writer through the input buffers.
  DataOutputBuffer keyBuf = new DataOutputBuffer();
  DataOutputBuffer valBuf = new DataOutputBuffer();
  DataInputBuffer keyIn = new DataInputBuffer();
  DataInputBuffer valIn = new DataInputBuffer();
  keySerializer.open(keyBuf);
  valueSerializer.open(valBuf);

  List<TezMerger.Segment> segments = new LinkedList<TezMerger.Segment>();
  for (int i = 0; i < numberOfStreams; i++) {
    BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
    InMemoryWriter writer = new InMemoryWriter(bout);
    Map<Writable, Writable> data = createData();
    // write data
    for (Map.Entry<Writable, Writable> entry : data.entrySet()) {
      keySerializer.serialize(entry.getKey());
      valueSerializer.serialize(entry.getValue());
      keyIn.reset(keyBuf.getData(), 0, keyBuf.getLength());
      valIn.reset(valBuf.getData(), 0, valBuf.getLength());
      writer.append(keyIn, valIn);
      // Keep a copy of everything written for later verification.
      originalData.put(entry.getKey(), entry.getValue());
      // Reset all scratch buffers so the next entry starts clean.
      keyBuf.reset();
      valBuf.reset();
      keyIn.reset();
      valIn.reset();
    }
    IFile.Reader reader =
        new InMemoryReader(mergeManager, null, bout.getBuffer(), 0, bout.getBuffer().length);
    segments.add(new TezMerger.Segment(reader, true));
    data.clear();
    writer.close();
  }
  return segments;
}
/**
 * Round-trip test for {@code SequenceFileAsBinaryOutputFormat}: writes RECORDS
 * random (IntWritable, DoubleWritable) pairs as raw bytes, then reads them back
 * with {@code SequenceFileInputFormat} and, by replaying the same random seed,
 * verifies every key and value matches what was written.
 */
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = new Job(conf);

  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  // Remember the seed so the exact same random sequence can be replayed on read.
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);
  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat =
      new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      // Serialize each writable into raw bytes and hand those to the writer.
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  // Replay the identical random sequence to regenerate the expected values.
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
        iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(),
            context.getTaskAttemptID(),
            reader,
            null,
            null,
            MapReduceTestUtil.createDummyReporter(),
            split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt,
            iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}