/**
 * Reads lines from {@code in} into {@code text} until a line that fully matches {@code
 * delimiterPattern} is read, a read returns zero bytes, or {@code maxBytesToConsume} lines have
 * been processed.
 *
 * <p>Every accepted line is appended to {@code text} preceded by {@code EOL}; the delimiter line
 * itself is not appended.
 *
 * @param text receives the accumulated record; cleared on entry
 * @param maxLineLength forwarded to {@code in.readLine}
 * @param maxBytesToConsume forwarded to {@code in.readLine}; also bounds the number of iterations
 * @return the sum of the values returned by {@code in.readLine}, delimiter line included
 * @throws IOException if the underlying read fails
 */
private int readNext(Text text, int maxLineLength, int maxBytesToConsume) throws IOException {
  text.clear();
  final Text lineBuffer = new Text();
  int consumed = 0;
  int iteration = 0;
  while (iteration < maxBytesToConsume) {
    final int lineBytes = in.readLine(lineBuffer, maxLineLength, maxBytesToConsume);
    consumed += lineBytes;
    final Matcher delimiterMatch = delimiterPattern.matcher(lineBuffer.toString());
    // A zero-byte read marks end of file; a delimiter line closes the current record.
    if (lineBytes == 0 || delimiterMatch.matches()) {
      break;
    }
    // Append value to record, each line prefixed with the EOL marker.
    text.append(EOL.getBytes(), 0, EOL.getLength());
    text.append(lineBuffer.getBytes(), 0, lineBuffer.getLength());
    iteration++;
  }
  return consumed;
}
/**
 * Writes {@code text} to {@code out} as an int length followed by that many content bytes.
 *
 * <p>A {@code null} text is written as the length {@code -1} with no content.
 *
 * @param out destination stream
 * @param text value to encode; may be {@code null}
 * @throws IOException if writing to {@code out} fails
 */
public static void encodeText(DataOutputStream out, Text text) throws IOException {
  if (text != null) {
    final int length = text.getLength();
    out.writeInt(length);
    // Only the first getLength() bytes of the backing array are written.
    out.write(text.getBytes(), 0, length);
  } else {
    out.writeInt(-1);
  }
}
/**
 * Parses one qseq record from {@code line} into {@code key} and {@code fragment}.
 *
 * <p>The key is built from fields 0 through 5 (machine to y-pos) with tabs replaced by ':',
 * followed by ':' and field 7 (the read number). The fragment receives the instrument, run
 * number, lane, tile, x/y position, read number, filter flag (field 10), index sequence
 * (field 6), sequence (field 8) and quality (field 9).
 *
 * @param line the raw record; not modified
 * @param key cleared and overwritten with the record key
 * @param fragment cleared and populated from the record
 * @throws FormatException if a field cannot be decoded as character data, or if {@code
 *     setFieldPositionsAndLengths} rejects the line
 */
private void scanQseqLine(Text line, Text key, SequencedFragment fragment) {
  setFieldPositionsAndLengths(line);
  final byte[] record = line.getBytes();

  // Key prefix: everything up to and including field 5, with tabs turned into colons.
  key.clear();
  key.append(record, 0, fieldPositions[5] + fieldLengths[5]);
  final int prefixEnd = key.getLength();
  final byte[] keyBytes = key.getBytes();
  for (int idx = 0; idx < prefixEnd; idx++) {
    if (keyBytes[idx] == '\t') {
      keyBytes[idx] = ':';
    }
  }

  // Append the read number together with the tab preceding it (hence the -1 / +1) ...
  key.append(record, fieldPositions[7] - 1, fieldLengths[7] + 1);
  // ... then turn that tab into a colon. The array is re-fetched because append may have
  // replaced the key's backing buffer.
  key.getBytes()[prefixEnd] = ':';

  // Now the fragment.
  try {
    fragment.clear();
    fragment.setInstrument(Text.decode(record, fieldPositions[0], fieldLengths[0]));
    fragment.setRunNumber(
        Integer.parseInt(Text.decode(record, fieldPositions[1], fieldLengths[1])));
    // fragment.setFlowcellId();
    fragment.setLane(Integer.parseInt(Text.decode(record, fieldPositions[2], fieldLengths[2])));
    fragment.setTile(Integer.parseInt(Text.decode(record, fieldPositions[3], fieldLengths[3])));
    fragment.setXpos(Integer.parseInt(Text.decode(record, fieldPositions[4], fieldLengths[4])));
    fragment.setYpos(Integer.parseInt(Text.decode(record, fieldPositions[5], fieldLengths[5])));
    fragment.setRead(Integer.parseInt(Text.decode(record, fieldPositions[7], fieldLengths[7])));
    fragment.setFilterPassed(record[fieldPositions[10]] != '0');
    // fragment.setControlNumber();

    // A leading '0' in the index field denotes a null index sequence.
    final boolean nullIndex = fieldLengths[6] > 0 && record[fieldPositions[6]] == '0';
    if (nullIndex) {
      fragment.setIndexSequence(null);
    } else {
      final String index = Text.decode(record, fieldPositions[6], fieldLengths[6]);
      fragment.setIndexSequence(index.replace('.', 'N'));
    }
  } catch (CharacterCodingException e) {
    throw new FormatException(
        "Invalid character format at "
            + makePositionMessage(this.pos - line.getLength())
            + "; line: "
            + line);
  }

  fragment.getSequence().append(record, fieldPositions[8], fieldLengths[8]);
  fragment.getQuality().append(record, fieldPositions[9], fieldLengths[9]);
}
/**
 * Test readLine for various kinds of line termination sequences. Varies buffer size to stress
 * test. Also checks that the values returned by readLine add up to the input's byte length.
 *
 * @throws Exception if creating the stream or reading from it fails
 */
@Test
public void testNewLines() throws Exception {
  final String STR = "a\nbb\n\nccc\rdddd\r\r\r\n\r\neeeee";
  final int STRLENBYTES = STR.getBytes().length;
  // Expected content length of each successive line:
  // "a"\n, "bb"\n, ""\n, "ccc"\r, "dddd"\r, ""\r, ""\r\n, ""\r\n, "eeeee"EOF
  final int[] expectedLineLengths = {1, 2, 0, 3, 4, 0, 0, 0, 5};
  Text out = new Text();
  for (int bufsz = 1; bufsz < STRLENBYTES + 1; ++bufsz) {
    LineReader in = makeStream(STR, bufsz);
    int c = 0;
    for (int line = 0; line < expectedLineLengths.length; ++line) {
      c += in.readLine(out);
      assertEquals(
          "line" + (line + 1) + " length, bufsz:" + bufsz,
          expectedLineLengths[line],
          out.getLength());
    }
    assertEquals("end of file, bufsz: " + bufsz, 0, in.readLine(out));
    // Expected value first, actual second, so a failure message reads the right way round.
    assertEquals("total bytes, bufsz: " + bufsz, STRLENBYTES, c);
  }
}
/**
 * Creates file {@code outfile} containing each element of {@code lines} followed by a newline
 * (the last element is newline-terminated as well).
 *
 * @param outfile path of the file to create on {@code fileSystem}
 * @param lines the lines to write, in order
 * @throws IOException if the file cannot be created, written or closed
 */
private void writeFile(Path outfile, List<String> lines) throws IOException {
  Text newline = new Text("\n");
  Text textLine = new Text();
  FSDataOutputStream dataOut = fileSystem.create(outfile);
  try {
    for (String line : lines) {
      textLine.set(line);
      // Only the first getLength() bytes of a Text's backing array are valid.
      dataOut.write(textLine.getBytes(), 0, textLine.getLength());
      dataOut.write(newline.getBytes(), 0, newline.getLength());
    }
  } finally {
    // Close even when a write fails so the stream is not leaked.
    dataOut.close();
  }
}
/**
 * Reads the next line from {@code in} and splits it into a fixed-width key and the remaining
 * value.
 *
 * <p>Lines shorter than {@code KEY_LENGTH} bytes become the key in full, with an empty value.
 *
 * @param key receives the first {@code KEY_LENGTH} bytes of the line (or the whole short line)
 * @param value receives the bytes after the key (or is cleared for a short line)
 * @return {@code true} if a line was read, {@code false} if {@code in} had no more records
 * @throws IOException if the underlying reader fails
 */
public boolean next(Text key, Text value) throws IOException {
  // The key produced by the wrapped reader is discarded into 'junk'.
  if (!in.next(junk, line)) {
    return false;
  }
  final int lineLength = line.getLength();
  if (lineLength >= KEY_LENGTH) {
    final byte[] raw = line.getBytes();
    key.set(raw, 0, KEY_LENGTH);
    value.set(raw, KEY_LENGTH, lineLength - KEY_LENGTH);
  } else {
    key.set(line);
    value.clear();
  }
  return true;
}
/**
 * Decodes every value for {@code key} with {@code codec}, feeds it to {@code aggs}, and writes
 * the re-encoded aggregate state under the same key.
 *
 * <p>When {@code cuboidLevel} is positive the {@code needAggr} argument is passed to the
 * aggregation call; otherwise the single-argument form is used. A progress message is logged
 * every {@code BatchConstants.NORMAL_RECORD_LOG_THRESHOLD} keys.
 *
 * @param key the key being reduced; written through unchanged
 * @param values encoded values to merge
 * @param context job context receiving the output record
 * @throws IOException if writing the output fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
public void reduce(Text key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException {
  aggs.reset();
  for (Text encoded : values) {
    // Wrap only the valid portion of the Text's backing array.
    final ByteBuffer view = ByteBuffer.wrap(encoded.getBytes(), 0, encoded.getLength());
    codec.decode(view, input);
    if (cuboidLevel <= 0) {
      aggs.aggregate(input);
    } else {
      aggs.aggregate(input, needAggr);
    }
  }
  aggs.collectStates(result);

  // The encoder leaves the buffer position at the end of the written bytes.
  final ByteBuffer valueBuf = codec.encode(result);
  outputValue.set(valueBuf.array(), 0, valueBuf.position());
  context.write(key, outputValue);

  if (++counter % BatchConstants.NORMAL_RECORD_LOG_THRESHOLD == 0) {
    logger.info("Handled " + counter + " records!");
  }
}
/**
 * Builds one record out of up to {@code MAX_LINE_COUNT} non-empty input lines.
 *
 * <p>For each non-empty line only the text before the first space is kept (e.g. {@code
 * "4847570"} from {@code "4847570 -1"}); the kept tokens are appended to {@code value}, each
 * followed by a single space. Empty lines are skipped and do not count towards the limit.
 *
 * <p>A line with no text before its first space (including a line made only of spaces)
 * contributes an empty token. Previously an all-space line raised {@code
 * ArrayIndexOutOfBoundsException}, because {@code String.split} returns an empty array for it.
 *
 * @param key not modified by this method
 * @param value cleared, then filled with the space-separated tokens
 * @return {@code true} if at least one line was read from {@code lineRecord}, even an empty one
 * @throws IOException if the underlying line reader fails
 */
public synchronized boolean next(LongWritable key, Text value) throws IOException {
  final byte[] space = {' '};
  boolean gotsomething = false;
  int counter = 0;
  value.clear();
  do {
    if (!lineRecord.next(lineKey, lineValue)) {
      break;
    }
    gotsomething = true;
    final String ln = lineValue.toString();
    if (ln.length() > 0) {
      // Keep the first element of a KV such as '4847570 -1'. indexOf/substring gives the same
      // result as split(" ")[0] wherever that worked, without failing on all-space lines.
      final int firstSpace = ln.indexOf(' ');
      lineValue.set(firstSpace < 0 ? ln : ln.substring(0, firstSpace));
      value.append(lineValue.getBytes(), 0, lineValue.getLength());
      value.append(space, 0, 1);
      counter++;
    }
  } while (counter < MAX_LINE_COUNT);
  return gotsomething;
}
/**
 * Reads the first line into {@code value} and strips a leading UTF-8 byte order mark
 * (0xEF, 0xBB, 0xBF) if one is present.
 *
 * <p>{@code pos} is advanced by the full number of bytes read, BOM included.
 *
 * @return the number of bytes reported by {@code readLine}, minus 3 when a BOM was removed
 * @throws IOException if reading the line fails
 */
private int skipUtfByteOrderMark() throws IOException {
  // Text only supports UTF-8, so only the UTF-8 BOM needs checking, and only at the very start
  // of the stream. Allow three extra bytes on this first read to make room for it.
  final long widenedLimit = 3L + (long) maxLineLength;
  final int newMaxLineLength = (int) Math.min(widenedLimit, Integer.MAX_VALUE);
  final int bytesRead = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
  // Reading 3 extra bytes does not alter existing behavior (no backwards-compatibility issue):
  // the number of bytes copied into the Text never exceeds the returned size, and if the
  // returned size is not less than maxLineLength the caller discards this line and reads the
  // next one.
  pos += bytesRead;

  final int length = value.getLength();
  final byte[] raw = value.getBytes();
  final boolean startsWithBom =
      length >= 3
          && raw[0] == (byte) 0xEF
          && raw[1] == (byte) 0xBB
          && raw[2] == (byte) 0xBF;
  if (!startsWithBom) {
    return bytesRead;
  }

  // Found the UTF-8 BOM; strip it.
  LOG.info("Found UTF-8 BOM and skipped it");
  final int remaining = length - 3;
  if (remaining > 0) {
    // It may work to reuse the same buffer instead of copying; the copy is kept as is.
    value.set(value.copyBytes(), 3, remaining);
  } else {
    value.clear();
  }
  return bytesRead - 3;
}
/**
 * Concatenates up to {@code MAX_LINE_COUNT} non-empty input lines into {@code value}, each
 * followed by a single space.
 *
 * <p>Empty lines are consumed but neither appended nor counted towards the limit.
 *
 * @param key not modified by this method
 * @param value cleared, then filled with the joined lines
 * @return {@code true} if at least one line was read from {@code lineRecord}, even an empty one
 * @throws IOException if the underlying line reader fails
 */
public synchronized boolean next(LongWritable key, Text value) throws IOException {
  final byte[] separator = {' '};
  boolean sawInput = false;
  int appended = 0;
  value.clear();
  do {
    if (!lineRecord.next(lineKey, lineValue)) {
      break;
    }
    sawInput = true;
    if (lineValue.toString().length() > 0) {
      value.append(lineValue.getBytes(), 0, lineValue.getLength());
      value.append(separator, 0, 1);
      appended++;
    }
  } while (appended < MAX_LINE_COUNT);
  return sawInput;
}
/**
 * Reads the next line of the split into {@code value}, replacing every {@code "$#$"} sequence
 * with the control character {@code \001}.
 *
 * <p>Lines whose consumed size is not below {@code maxLineLength} are logged and skipped. The
 * delimiter replacement is applied only to a line that is actually returned; the contents of
 * {@code value} are unspecified when this method returns {@code false}.
 *
 * @param key set to the byte position at which the returned line starts
 * @param value receives the line with delimiters replaced
 * @return {@code true} if a line was produced, {@code false} at the end of the split or input
 * @throws IOException if the underlying reader fails
 */
@Override
public boolean next(LongWritable key, Text value) throws IOException {
  while (pos < end) {
    key.set(pos);
    int newSize =
        lineReader.readLine(
            value,
            maxLineLength,
            Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
    if (newSize == 0) {
      return false;
    }
    pos += newSize;
    if (newSize < maxLineLength) {
      // Text.set(String) encodes straight into 'value'; no temporary Text or second copy needed.
      value.set(value.toString().replace("$#$", "\001"));
      return true;
    }
    // line too long. try again
    log.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
  }
  return false;
}
/**
 * Emits generated link ids for every page in the range assigned to one slot.
 *
 * <p>The input value is parsed as the slot id. For each page id in the slot's range, one {@code
 * (page id, link id)} pair is collected per id returned by {@code html.genPureLinkIds()}, and the
 * {@code BYTES_DATA_GENERATED} counter is increased by 8 plus the byte length of the link text.
 *
 * @param key reused as the output key; overwritten with each page id
 * @param value text holding the slot id (surrounding whitespace is trimmed)
 * @param output collector receiving the generated pairs
 * @param reporter used to update the generated-bytes counter
 * @throws IOException if collecting an output record fails
 */
public void map(
    LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Reporter reporter)
    throws IOException {
  final int slotId = Integer.parseInt(value.toString().trim());
  html.fireRandom(slotId);
  final long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);
  final long firstPage = range[0];
  final long endPage = range[1];

  // For output collect
  for (long page = firstPage; page < endPage; page++) {
    key.set(page);
    for (long linkId : html.genPureLinkIds()) {
      final Text link = new Text(Long.toString(linkId));
      output.collect(key, link);
      reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8 + link.getLength());
    }
    if (page % 10000 == 0) {
      log.info("still running: " + (page - firstPage) + " of " + slotpages);
    }
  }
}
/**
 * Generates random data, compresses it, optionally indexes it, and MD5-hashes it; then reads
 * everything back through {@code LzoTextInputFormat} and verifies that the MD5 of what was read
 * matches.
 *
 * @param testWithIndex whether to create an LZO index for the generated file
 * @param charsToOutput how many characters of random data to generate
 * @throws IOException if file system or record I/O fails
 * @throws NoSuchAlgorithmException declared for callers; not thrown directly by the visible code
 * @throws InterruptedException if reading or writing records is interrupted
 */
private void runTest(boolean testWithIndex, int charsToOutput)
    throws IOException, NoSuchAlgorithmException, InterruptedException {
  final Configuration conf = new Configuration();
  // Reduce the block size to force a split of the tiny file.
  conf.setLong("fs.local.block.size", charsToOutput / 2);
  conf.set("io.compression.codecs", LzopCodec.class.getName());

  Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

  // Remove the cached filesystem (if any) before obtaining the one used by the test.
  FileSystem.getLocal(conf).close();
  final FileSystem localFs = FileSystem.getLocal(conf);
  localFs.delete(outputDir_, true);
  localFs.mkdirs(outputDir_);

  final Job job = new Job(conf);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
  TextOutputFormat.setOutputPath(job, outputDir_);

  final TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  // Create some input data and remember its hash.
  final byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

  if (testWithIndex) {
    LzoIndex.createIndex(localFs, new Path(outputDir_, lzoFileName_));
  }

  final LzoTextInputFormat inputFormat = new LzoTextInputFormat();
  TextInputFormat.setInputPaths(job, outputDir_);
  final List<InputSplit> splits = inputFormat.getSplits(job);

  // Verify we have the right number of lzo chunks: three for the big indexed file, else one.
  final int expectedSplits = (testWithIndex && OUTPUT_BIG == charsToOutput) ? 3 : 1;
  assertEquals(expectedSplits, splits.size());

  // Read everything back and fold it into the MD5 digest.
  for (InputSplit split : splits) {
    final RecordReader<LongWritable, Text> reader =
        inputFormat.createRecordReader(split, attemptContext);
    reader.initialize(split, attemptContext);
    while (reader.nextKeyValue()) {
      final Text record = reader.getCurrentValue();
      md5_.update(record.getBytes(), 0, record.getLength());
    }
    reader.close();
  }

  localFs.close();
  assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
}
/**
 * Tests whether the row of key {@code k} falls inside {@code rangeCache}.
 *
 * <p>The row is copied into the reusable {@code row} field, its insertion id is extracted via
 * {@code GeowaveRowId}, and the per-dimension coordinates obtained from {@code indexStrategy}
 * are checked against the cache.
 *
 * @param k the key whose row is examined
 * @return the result of {@code rangeCache.inBounds} for the row's coordinates
 */
private boolean inBounds(final Key k) {
  k.getRow(row);
  // Only the first getLength() bytes of the row's backing array are valid.
  final GeowaveRowId rowId = new GeowaveRowId(row.getBytes(), row.getLength());
  final ByteArrayId insertionId = new ByteArrayId(rowId.getInsertionId());
  final MultiDimensionalCoordinates coordinates =
      indexStrategy.getCoordinatesPerDimension(insertionId);
  return rangeCache.inBounds(coordinates);
}
/**
 * Called for every record in the data: parses the record into a {@code Spinn3rDocument}, applies
 * {@code filter}, and writes the document out when it satisfies the filter.
 *
 * <p>Records larger than {@code MAX_DOC_SIZE_IN_BYTES} are counted as skipped and ignored.
 * Parsing and filtering times (in nanoseconds) are added to the corresponding counters. The
 * output uses {@code toStringF5()} when the {@code formatF5} option is set, otherwise {@code
 * toString()}.
 */
@Override
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  // Skip enormous documents, due to memory problems and since regex cannot handle them.
  if (value.getLength() > MAX_DOC_SIZE_IN_BYTES) {
    context.getCounter(ProcessingTime.SKIPPED).increment(1);
    return;
  }

  // Parse document and measure time.
  t1 = System.nanoTime();
  final Spinn3rDocument document = new Spinn3rDocument(value.toString());
  t2 = System.nanoTime();
  context.getCounter(ProcessingTime.PARSING).increment(t2 - t1);

  // Evaluate the search conditions and measure time.
  t1 = System.nanoTime();
  t = filter.documentSatisfies(document);
  t2 = System.nanoTime();
  context.getCounter(ProcessingTime.FILTERING).increment(t2 - t1);

  // Output only documents that satisfy the conditions.
  if (!t) {
    return;
  }
  final Text rendered;
  if (cmdMap.hasOption("formatF5")) {
    rendered = new Text(document.toStringF5());
  } else {
    rendered = new Text(document.toString());
  }
  context.write(rendered, NullWritable.get());
}
/**
 * Sums the counts per dimension for {@code key} and writes them as one tab-separated value.
 *
 * <p>Each input value is expected in the form {@code <count>:<dimension>}. The output value is
 * the list of {@code <dimension>,<total>} entries joined by tabs, in the iteration order of the
 * backing {@link HashMap}.
 *
 * <p>The joined string is handed to {@code Text.set(String)}, which encodes it itself. The
 * previous code passed {@code String.length()} (a character count) as the byte count of {@code
 * getBytes()}, which truncated any dimension containing non-ASCII characters.
 *
 * @param key the key being reduced; written through unchanged
 * @param values the {@code count:dimension} entries for this key
 * @param context job context receiving the output record
 * @throws IOException if writing the output fails
 * @throws InterruptedException if the write is interrupted
 * @throws NumberFormatException if a count is not a valid integer
 */
public void reduce(Text key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException {
  HashMap<String, Integer> aggResult = new HashMap<String, Integer>();
  for (Text val : values) {
    String[] fields = val.toString().split(":");
    int num = Integer.parseInt(fields[0]);
    String dim = fields[1];
    Integer soFar = aggResult.get(dim);
    aggResult.put(dim, soFar == null ? num : soFar + num);
  }

  StringBuilder joined = new StringBuilder();
  for (java.util.Map.Entry<String, Integer> entry : aggResult.entrySet()) {
    if (joined.length() > 0) {
      joined.append('\t');
    }
    joined.append(entry.getKey()).append(',').append(entry.getValue());
  }

  // set() replaces any previous content, so no separate clear() is needed.
  newValue.set(joined.toString());
  context.write(key, newValue);
}
/* Store the contents of Text object t as entry i of the output vector outV.
 * A null t is recorded as a SQL NULL instead: the vector's noNulls flag is
 * cleared and entry i is marked as null.
 */
private static void setString(BytesColumnVector outV, int i, Text t) {
  if (t != null) {
    // Only the first getLength() bytes of the Text's backing array are valid.
    outV.setVal(i, t.getBytes(), 0, t.getLength());
    return;
  }
  outV.noNulls = false;
  outV.isNull[i] = true;
}
/**
 * Writes {@code o} to the output stream. A {@code Text} is written directly from its valid
 * bytes; any other object is written as the {@code utf8}-encoded form of its {@code toString()}.
 *
 * @param o the object to write
 * @throws IOException if the write throws, we pass it on
 */
private void writeObject(Object o) throws IOException {
  if (!(o instanceof Text)) {
    out.write(o.toString().getBytes(utf8));
    return;
  }
  // Text is the special case: copy its valid bytes without going through a String.
  final Text text = (Text) o;
  out.write(text.getBytes(), 0, text.getLength());
}
/**
 * Emits two records per input line: {@code ("bytes", <byte length of the line>)} and {@code
 * ("lines", 1)}.
 *
 * @param key not used
 * @param value the input line
 * @param context job context receiving both records
 * @throws IOException if writing an output record fails
 * @throws InterruptedException if a write is interrupted
 */
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  final IntWritable byteCount = new IntWritable(value.getLength());
  context.write(new Text("bytes"), byteCount);
  context.write(new Text("lines"), new IntWritable(1));
}
/**
 * Splits the input line at its first tab and writes the part before it as key and the part
 * after it as value.
 *
 * <p>Nothing is written when the line contains no tab or when the first tab is the very first
 * byte (which would give an empty key).
 *
 * @param key not used
 * @param value the tab-separated input line
 * @param output job context receiving the {@code (outputKey, outputValue)} pair
 * @throws IOException if writing the output fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
public void map(LongWritable key, Text value, Context output)
    throws IOException, InterruptedException {
  final byte[] raw = value.getBytes();
  final int length = value.getLength();

  // Locate the first tab within the valid part of the buffer.
  int tabAt = -1;
  int cursor = 0;
  while (cursor < length && tabAt < 0) {
    if (raw[cursor] == '\t') {
      tabAt = cursor;
    }
    cursor++;
  }

  if (tabAt <= 0) {
    return;
  }
  outputKey.set(raw, 0, tabAt);
  outputValue.set(raw, tabAt + 1, length - (tabAt + 1));
  output.write(outputKey, outputValue);
}
/**
 * Emits a string cell: decodes the option's bytes with {@code decoder} and passes the decoded
 * characters on through {@code consumeDecoded()}.
 *
 * <p>Nothing further is emitted after {@code startCell()} when {@code emitNull} handles the
 * option or when the text is empty.
 *
 * @param option the value to emit
 * @throws IOException if emitting fails
 * @throws RecordFormatException if the bytes cannot be decoded
 */
@Override
public void emit(StringOption option) throws IOException {
  startCell();
  if (emitNull(option)) {
    return;
  }
  final Text text = option.get();
  final int byteCount = text.getLength();
  if (byteCount == 0) {
    return;
  }

  final ByteBuffer source = ByteBuffer.wrap(text.getBytes(), 0, byteCount);
  decoder.reset();
  decodeBuffer.clear();

  // A non-error CoderResult is either overflow (buffer full: drain and continue) or underflow
  // (input exhausted: drain and stop), so the decoded characters are consumed in both cases.
  CoderResult outcome;
  do {
    outcome = decoder.decode(source, decodeBuffer, true);
    if (outcome.isError()) {
      throw new RecordFormatException(
          MessageFormat.format("Cannot process a character string (\"{0}\")", outcome));
    }
    consumeDecoded();
  } while (!outcome.isUnderflow());

  // Flush whatever the decoder still holds internally, draining the buffer the same way.
  do {
    outcome = decoder.flush(decodeBuffer);
    if (outcome.isError()) {
      throw new RecordFormatException(
          MessageFormat.format("Cannot process a character string (\"{0}\")", outcome));
    }
    consumeDecoded();
  } while (!outcome.isUnderflow());
}
/**
 * Parses {@code line} as JSON and stores the resulting document in the {@code doc} field.
 *
 * @param line the JSON text of one document
 * @return {@code true} if the document was parsed; {@code false} if the line is empty or
 *     parsing failed (the failure is logged together with its stack trace)
 */
public boolean jsonToDocument(Text line) {
  if (line.getLength() == 0) {
    return false;
  }
  try {
    doc = DocumentFabric.fromJson(line.toString());
  } catch (Exception e) {
    // Pass the exception itself so the stack trace and cause are not lost; the message alone
    // can be null or too terse to locate the failure.
    LOG.error(e.getMessage(), e);
    return false;
  }
  return true;
}
/**
 * Creates an lzo file with random data and returns the MD5 hash of what was written.
 *
 * <p>For every record the hash covers the key bytes, a tab (the separator the text output format
 * writes between key and value) and the value bytes.
 *
 * @param outputDir Output directory.
 * @param fs File system we're using.
 * @param attemptContext Task attempt context, contains task id etc.
 * @param charsToOutput Minimum number of characters to generate; records are written until the
 *     running total returned by {@code fillText} reaches this value.
 * @return the MD5 digest of the generated records; {@code md5_} is reset afterwards
 * @throws IOException if writing or committing the output fails
 * @throws InterruptedException if writing or closing the record writer is interrupted
 */
private byte[] createTestInput(
    Path outputDir, FileSystem fs, TaskAttemptContext attemptContext, int charsToOutput)
    throws IOException, InterruptedException {
  final TextOutputFormat<Text, Text> output = new TextOutputFormat<Text, Text>();
  RecordWriter<Text, Text> rw = null;
  md5_.reset();
  try {
    rw = output.getRecordWriter(attemptContext);

    final char[] alphabet = "abcdefghijklmnopqrstuvwxyz\u00E5\u00E4\u00F6".toCharArray();
    final Random random = new Random(System.currentTimeMillis());
    final Text key = new Text();
    final Text value = new Text();
    final int charsMax = alphabet.length - 1;

    int generated = 0;
    while (generated < charsToOutput) {
      generated += fillText(alphabet, random, charsMax, key);
      generated += fillText(alphabet, random, charsMax, value);
      rw.write(key, value);

      md5_.update(key.getBytes(), 0, key.getLength());
      // text output format writes tab between the key and value
      md5_.update("\t".getBytes("UTF-8"));
      md5_.update(value.getBytes(), 0, value.getLength());
    }
  } finally {
    if (rw != null) {
      rw.close(attemptContext);
      final OutputCommitter committer = output.getOutputCommitter(attemptContext);
      committer.commitTask(attemptContext);
      committer.cleanupJob(attemptContext);
    }
  }

  final byte[] digest = md5_.digest();
  md5_.reset();
  return digest;
}
/**
 * Hashes the bytes of {@code n} with the instance's {@code digest} (SHA-1 according to the
 * original description) and returns the hash as a hex string.
 *
 * @param n the text to hash; may be {@code null}
 * @return the reusable {@code result} holding the hex digest, or {@code null} for null input
 */
public Text evaluate(Text n) {
  if (n == null) {
    return null;
  }
  digest.reset();
  // Hash only the valid portion of the Text's backing array.
  digest.update(n.getBytes(), 0, n.getLength());
  result.set(Hex.encodeHexString(digest.digest()));
  return result;
}
/**
 * Returns the numeric value of the first byte of {@code s}.
 *
 * <p>The byte is widened as a signed value, so a first byte of 0x80 or above yields a negative
 * number.
 *
 * @param s the input text; may be {@code null}
 * @return the reusable {@code result} holding the first byte's value (0 for empty text), or
 *     {@code null} for null input
 */
public IntWritable evaluate(Text s) {
  if (s == null) {
    return null;
  }
  int firstByte = 0;
  if (s.getLength() > 0) {
    firstByte = s.getBytes()[0];
  }
  result.set(firstByte);
  return result;
}
/**
 * Parses one input line into a map via {@code mapper}, derives a row id from it, and adds the
 * record to the pending mutation batch. The batch is executed once it holds more than 300 rows.
 *
 * <p>The raw text is stored in the {@code raw} column. When {@code storeAttirbute} is set, every
 * parsed entry is also stored under its lower-cased key, except keys starting with {@code "__"}
 * and entries whose value is null or the empty string.
 *
 * <p>NOTE(review): {@code mapper} is presumably a JSON object mapper — confirm against the
 * field's declaration.
 *
 * @param key not used
 * @param value the input record; empty records are ignored
 * @param collector not used; nothing is collected
 * @param reporter not used
 * @throws IOException if parsing the record fails
 */
@Override
public void map(
    LongWritable key,
    Text value,
    OutputCollector<NullWritable, NullWritable> collector,
    Reporter reporter)
    throws IOException {
  if (value.getLength() == 0) return;

  // Text.getBytes() exposes the whole backing array, which may be longer than the current
  // record and still hold bytes of an earlier, longer one. Hand the parser only the valid part.
  byte[] raw = java.util.Arrays.copyOf(value.getBytes(), value.getLength());
  Map<String, Object> msg = mapper.readValue(raw, Map.class);

  String rowId = createRowId(msg);
  if (rowId == null) {
    // TODO ... Error Handler
    return;
  }

  if (mb == null) {
    mb = ks.prepareMutationBatch();
  }
  ColumnListMutation<String> c = mb.withRow(cf, rowId);
  c.putColumn("raw", value.toString(), null);

  if (storeAttirbute) {
    for (String k : msg.keySet()) {
      if (k.startsWith("__")) continue;
      Object v = msg.get(k);
      if (v == null) continue;
      if (v.equals("")) continue;
      c.putColumn(k.toLowerCase(), v.toString(), null);
    }
  }

  try {
    if (mb.getRowCount() > 300) {
      mb.execute();
      mb = null;
    }
  } catch (ConnectionException e) {
    // Best effort: the failed batch is dropped and a new one is started on the next record.
    e.printStackTrace();
    mb = null;
  }
}
/**
 * Emits one {@code (page id, page id as text)} pair for every page in the range assigned to one
 * slot.
 *
 * <p>The input value is parsed as the slot id. For each emitted pair the {@code
 * BYTES_DATA_GENERATED} counter is increased by 8 plus the byte length of the text value.
 *
 * @param key reused as the output key; overwritten with each page id
 * @param value text holding the slot id (surrounding whitespace is trimmed)
 * @param output collector receiving the generated pairs
 * @param reporter used to update the generated-bytes counter
 * @throws IOException if collecting an output record fails
 */
@Override
public void map(
    LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Reporter reporter)
    throws IOException {
  final int slotId = Integer.parseInt(value.toString().trim());
  final long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);
  final long endPage = range[1];
  for (long page = range[0]; page < endPage; page++) {
    key.set(page);
    final Text pageText = new Text(Long.toString(page));
    output.collect(key, pageText);
    reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8 + pageText.getLength());
  }
}
/*
 * Scans the text line for delimiter-separated fields and records where each
 * one starts and how long it is in the instance arrays 'fieldPositions' and
 * 'fieldLengths'.
 *
 * Scanning stops after NUM_QSEQ_COLS fields or once the position reaches the
 * end of the line, so an empty field at the very end of the line is not
 * counted.
 *
 * @exception FormatException Line doesn't have the expected number of fields.
 */
private void setFieldPositionsAndLengths(Text line) {
  final int lineLength = line.getLength();
  int cursor = 0; // byte position within the record
  int fieldCount = 0; // number of fields recorded so far

  for (; cursor < lineLength && fieldCount < NUM_QSEQ_COLS; fieldCount++) {
    int fieldEnd = line.find(Delim, cursor);
    if (fieldEnd < 0) {
      // No further delimiter: the field runs to the end of the line.
      fieldEnd = lineLength;
    }
    fieldPositions[fieldCount] = cursor;
    fieldLengths[fieldCount] = fieldEnd - cursor;
    cursor = fieldEnd + 1; // the next field starts right after the delimiter
  }

  if (fieldCount != NUM_QSEQ_COLS) {
    throw new FormatException(
        "found "
            + fieldCount
            + " fields instead of 11 at "
            + makePositionMessage(this.pos - lineLength)
            + ". Line: "
            + line);
  }
}
/**
 * Converts a string to an integer. This is called for CAST(... AS INT).
 *
 * @param i The string value to convert
 * @return the reusable {@code intWritable} holding the parsed value, or {@code null} if {@code
 *     i} is null or is not a well-formed base-10 integer
 */
public IntWritable evaluate(Text i) {
  if (i == null) {
    return null;
  }
  try {
    intWritable.set(LazyInteger.parseInt(i.getBytes(), 0, i.getLength(), 10));
  } catch (NumberFormatException e) {
    // MySQL returns 0 if the string is not a well-formed numeric value.
    // We return NULL instead, which is more conservative.
    return null;
  }
  return intWritable;
}
/**
 * Writes every value of {@code key} as its own entry: row {@code key}, column family {@code
 * "colf"}, and a qualifier of the form {@code col_NNNNNNN} numbered from zero in iteration order.
 *
 * <p>All entries of one call share a single timestamp taken from the local clock. Be careful
 * with it: on a cluster where the time is off, an update may not be visible in accumulo if an
 * existing value carries a later timestamp. Make sure ntp is running on the cluster, or
 * consider logical time or letting accumulo set the time.
 *
 * @param key the row for all emitted entries
 * @param values the values to store, one column each
 * @param output job context receiving the key/value pairs
 * @throws IOException if writing an entry fails
 * @throws InterruptedException if a write is interrupted
 */
@Override
public void reduce(Text key, Iterable<Text> values, Context output)
    throws IOException, InterruptedException {
  final long timestamp = System.currentTimeMillis();
  int index = 0;
  for (Text value : values) {
    final Text qualifier = new Text(String.format("col_%07d", index++));
    final Key outputKey = new Key(key, new Text("colf"), qualifier, timestamp);
    // Copy only the valid portion of the Text's backing array into the Value.
    final Value outputValue = new Value(value.getBytes(), 0, value.getLength());
    output.write(outputKey, outputValue);
  }
}