/**
 * Pairs up the two titles that arrive for one document-number pair.
 *
 * <p>Each incoming title is tagged with a language id. A title tagged {@code CLIRUtils.E} is
 * stored in {@code eTitle}; one tagged {@code CLIRUtils.F} is stored in {@code fTitle}. When
 * exactly two titles were seen, the pair is emitted as (fTitle, eTitle); otherwise the
 * incomplete pair is only logged.
 *
 * @param docnoPair the key the titles were grouped under
 * @param titles    the (language id, title) values for this key
 * @param output    collector receiving the (fTitle, eTitle) pair
 * @param reporter  unused
 * @throws IOException if the collector fails
 * @throws RuntimeException if a title carries a language id other than E or F
 */
@Override
public void reduce(
    PairOfInts docnoPair,
    Iterator<PairOfIntString> titles,
    OutputCollector<Text, Text> output,
    Reporter reporter)
    throws IOException {
  eTitle.clear();
  fTitle.clear();
  sLogger.info(docnoPair);

  int seen = 0;
  while (titles.hasNext()) {
    PairOfIntString entry = titles.next();
    sLogger.info(entry);

    if (entry.getLeftElement() == CLIRUtils.E) {
      eTitle.set(entry.getRightElement());
    } else if (entry.getLeftElement() == CLIRUtils.F) {
      fTitle.set(entry.getRightElement());
    } else {
      throw new RuntimeException("Unknown language ID: " + entry.getLeftElement());
    }
    // Only reached for a recognised language id; the unknown case has thrown above.
    seen++;
  }

  if (seen != 2) {
    sLogger.info("Incomplete data for " + docnoPair + ":" + fTitle + "," + eTitle);
    return;
  }
  output.collect(fTitle, eTitle);
}
private int readNext(Text text, int maxLineLength, int maxBytesToConsume) throws IOException { int offset = 0; text.clear(); Text tmp = new Text(); for (int i = 0; i < maxBytesToConsume; i++) { int offsetTmp = in.readLine(tmp, maxLineLength, maxBytesToConsume); offset += offsetTmp; Matcher m = delimiterPattern.matcher(tmp.toString()); // End of File if (offsetTmp == 0) { break; } if (m.matches()) { break; } else { // Append value to record text.append(EOL.getBytes(), 0, EOL.getLength()); text.append(tmp.getBytes(), 0, tmp.getLength()); } } return offset; }
@Override public void map(LongWritable row, NullWritable ignored, Context context) throws IOException, InterruptedException { context.setStatus("Entering"); long rowId = row.get(); if (rand == null) { // we use 3 random numbers per a row rand = new RandomGenerator(rowId * 3); } addKey(); value.clear(); // addRowId(rowId); addFiller(rowId); // New Mutation m = new Mutation(key); m.put( new Text("c"), // column family getRowIdString(rowId), // column qual new Value(value.toString().getBytes())); // data context.setStatus("About to add to accumulo"); context.write(tableName, m); context.setStatus("Added to accumulo " + key.toString()); }
public synchronized boolean next(LongWritable key, Text value) throws IOException { boolean gotsomething; boolean retval; byte space[] = {' '}; int counter = 0; String ln = null; value.clear(); gotsomething = false; do { retval = lineRecord.next(lineKey, lineValue); if (retval) { if (lineValue.toString().length() > 0) { ln = lineValue.toString(); lineValue.set( ln.split(" ")[ 0]); // here we basically get the first element from a KV such as '4847570 -1' byte[] rawline = lineValue.getBytes(); int rawlinelen = lineValue.getLength(); value.append(rawline, 0, rawlinelen); value.append(space, 0, 1); counter++; } gotsomething = true; } else { break; } } while (counter < MAX_LINE_COUNT); // System.out.println("ParagraphRecordReader::next() returns "+gotsomething+" after setting // value to: ["+value.toString()+"]"); return gotsomething; }
/**
 * Builds one "paragraph" value by joining consecutive non-empty lines with a trailing space.
 *
 * <p>Lines are pulled from {@code lineRecord} until {@code MAX_LINE_COUNT} non-empty lines have
 * been appended or the underlying reader is exhausted. Empty lines are skipped and do not count
 * towards the limit.
 *
 * @param key   not modified by this method
 * @param value receives the joined lines; cleared first
 * @return {@code true} if the underlying reader returned at least one line during this call
 * @throws IOException if the underlying reader fails
 */
public synchronized boolean next(LongWritable key, Text value) throws IOException {
  final byte[] separator = {' '};
  boolean sawRecord = false;
  int appended = 0;

  value.clear();
  do {
    if (!lineRecord.next(lineKey, lineValue)) {
      break;
    }
    sawRecord = true;

    if (lineValue.toString().length() > 0) {
      value.append(lineValue.getBytes(), 0, lineValue.getLength());
      value.append(separator, 0, 1);
      appended++;
    }
  } while (appended < MAX_LINE_COUNT);

  return sawRecord;
}
/**
 * Sums counts per dimension and writes them as a single tab-separated value.
 *
 * <p>Each input value has the form {@code "<count>:<dimension>"}. Counts are summed per
 * dimension, and the result is emitted as {@code "<dimension>,<sum>"} entries separated by
 * tabs, in the iteration order of the backing {@code HashMap}.
 *
 * <p>The output strings are encoded explicitly as UTF-8 and appended using the length of the
 * encoded byte array. Passing {@code String.length()} (a char count) as the byte length
 * truncates any entry containing non-ASCII characters, and the no-argument {@code getBytes()}
 * depends on the platform charset.
 *
 * @param key     the group key, written through unchanged
 * @param values  the {@code "<count>:<dimension>"} values for this key
 * @param context receives the (key, aggregated value) pair
 * @throws IOException          if writing fails
 * @throws InterruptedException if writing is interrupted
 * @throws NumberFormatException if a count is not a valid integer
 */
public void reduce(Text key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException {
  HashMap<String, Integer> aggResult = new HashMap<String, Integer>();
  newValue.clear();

  for (Text val : values) {
    String[] fields = val.toString().split(":");
    int num = Integer.parseInt(fields[0]);
    String dim = fields[1];

    Integer soFar = aggResult.get(dim);
    aggResult.put(dim, soFar == null ? num : soFar + num);
  }

  final byte[] tab = "\t".getBytes("UTF-8");
  for (String hashKey : aggResult.keySet()) {
    String singleValue = hashKey + "," + aggResult.get(hashKey).toString();
    byte[] encoded = singleValue.getBytes("UTF-8");

    if (newValue.getLength() > 0) {
      newValue.append(tab, 0, tab.length);
    }
    newValue.append(encoded, 0, encoded.length);
  }

  context.write(key, newValue);
}
private int skipUtfByteOrderMark() throws IOException { // Strip BOM(Byte Order Mark) // Text only support UTF-8, we only need to check UTF-8 BOM // (0xEF,0xBB,0xBF) at the start of the text stream. int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE); int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos)); // Even we read 3 extra bytes for the first line, // we won't alter existing behavior (no backwards incompat issue). // Because the newSize is less than maxLineLength and // the number of bytes copied to Text is always no more than newSize. // If the return size from readLine is not less than maxLineLength, // we will discard the current line and read the next line. pos += newSize; int textLength = value.getLength(); byte[] textBytes = value.getBytes(); if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB) && (textBytes[2] == (byte) 0xBF)) { // find UTF-8 BOM, strip it. LOG.info("Found UTF-8 BOM and skipped it"); textLength -= 3; newSize -= 3; if (textLength > 0) { // It may work to use the same buffer and not do the copyBytes textBytes = value.copyBytes(); value.set(textBytes, 3, textLength); } else { value.clear(); } } return newSize; }
private void scanQseqLine(Text line, Text key, SequencedFragment fragment) { setFieldPositionsAndLengths(line); // Build the key. We concatenate all fields from 0 to 5 (machine to y-pos) // and then the read number, replacing the tabs with colons. key.clear(); // append up and including field[5] key.append(line.getBytes(), 0, fieldPositions[5] + fieldLengths[5]); // replace tabs with : byte[] bytes = key.getBytes(); int temporaryEnd = key.getLength(); for (int i = 0; i < temporaryEnd; ++i) if (bytes[i] == '\t') bytes[i] = ':'; // append the read number key.append( line.getBytes(), fieldPositions[7] - 1, fieldLengths[7] + 1); // +/- 1 to catch the preceding tab. // convert the tab preceding the read number into a : key.getBytes()[temporaryEnd] = ':'; // now the fragment try { fragment.clear(); fragment.setInstrument(Text.decode(line.getBytes(), fieldPositions[0], fieldLengths[0])); fragment.setRunNumber( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[1], fieldLengths[1]))); // fragment.setFlowcellId(); fragment.setLane( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[2], fieldLengths[2]))); fragment.setTile( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[3], fieldLengths[3]))); fragment.setXpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[4], fieldLengths[4]))); fragment.setYpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[5], fieldLengths[5]))); fragment.setRead( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[7], fieldLengths[7]))); fragment.setFilterPassed(line.getBytes()[fieldPositions[10]] != '0'); // fragment.setControlNumber(); if (fieldLengths[6] > 0 && line.getBytes()[fieldPositions[6]] == '0') // 0 is a null index sequence fragment.setIndexSequence(null); else fragment.setIndexSequence( Text.decode(line.getBytes(), fieldPositions[6], fieldLengths[6]).replace('.', 'N')); } catch (CharacterCodingException e) { throw new FormatException( "Invalid character format at " + 
makePositionMessage(this.pos - line.getLength()) + "; line: " + line); } fragment.getSequence().append(line.getBytes(), fieldPositions[8], fieldLengths[8]); fragment.getQuality().append(line.getBytes(), fieldPositions[9], fieldLengths[9]); }
/**
 * Reads the next line and splits it into a fixed-width key and the remaining value.
 *
 * <p>The first {@code KEY_LENGTH} bytes of the line become the key and the rest becomes the
 * value. A line shorter than {@code KEY_LENGTH} is used as the key in full, with an empty value.
 *
 * @param key   receives the key bytes
 * @param value receives the bytes after the key, or is cleared for a short line
 * @return {@code true} if a line was read, {@code false} once the underlying reader is exhausted
 * @throws IOException if the underlying reader fails
 */
public boolean next(Text key, Text value) throws IOException {
  if (!in.next(junk, line)) {
    return false;
  }

  int lineLength = line.getLength();
  if (lineLength < KEY_LENGTH) {
    key.set(line);
    value.clear();
    return true;
  }

  byte[] raw = line.getBytes();
  key.set(raw, 0, KEY_LENGTH);
  value.set(raw, KEY_LENGTH, lineLength - KEY_LENGTH);
  return true;
}
/**
 * Decodes an encoded URL string.
 *
 * <p>When {@code enc} is "jsescape" or "js_escape" (compared after lower-casing) the input is
 * passed through {@code Escape.unescape}. Otherwise it is decoded with
 * {@code URLDecoder.decode} using {@code enc} as the charset name, defaulting to "UTF8" when
 * {@code enc} is null. If decoding throws for any reason, the input is returned unchanged.
 *
 * @param srcURL the encoded URL; may be null
 * @param enc    the encoding name; may be null
 * @return the shared {@code dstURL} instance holding the result, or null if {@code srcURL} is
 *     null
 */
public Text evaluate(Text srcURL, Text enc) {
  if (srcURL == null) {
    return null;
  }

  String encoded = srcURL.toString();
  String encodingName = (enc == null) ? "UTF8" : enc.toString();
  dstURL.clear();

  String normalized = encodingName.toLowerCase();
  boolean jsEscaped = normalized.equals("jsescape") || normalized.equals("js_escape");
  if (jsEscaped) {
    dstURL.set(Escape.unescape(encoded));
    return dstURL;
  }

  try {
    dstURL.set(URLDecoder.decode(encoded, encodingName));
  } catch (Exception e) {
    // Best effort: on any decoding failure, hand the input back untouched.
    dstURL.set(srcURL);
  }
  return dstURL;
}
/**
 * Reads every remaining line from the underlying reader and sets the value to all non-empty
 * lines, each terminated by a newline.
 *
 * <p>The method returns {@code false} once the underlying reader has nothing left. Always
 * returning {@code true} makes a caller that loops on {@code next} spin forever on empty
 * records. A line is appended only when the underlying reader actually returned one, so
 * whatever is left in the line buffer after the final unsuccessful read is never added.
 *
 * @param key   passed through to the underlying reader, which sets it for each line
 * @param value receives the concatenated content; cleared first
 * @return {@code true} if at least one line was read during this call, {@code false} otherwise
 * @throws IOException if the underlying reader fails
 */
public synchronized boolean next(LongWritable key, Text value) throws IOException {
  Text line = new Text();
  StringBuilder contents = new StringBuilder();
  boolean gotSomething = false;

  value.clear();
  while (recordReader.next(key, line)) {
    gotSomething = true;
    String lineValue = line.toString();
    if (lineValue.length() > 0) {
      contents.append(lineValue).append('\n');
    }
  }

  value.set(contents.toString());
  return gotSomething;
}
/* Finds a full sentence and sets it as the value. * If the sentence is shorter than the full line, the rest is stored to use later. */ public synchronized boolean next(LongWritable key, Text value) throws IOException { Text line = new Text(); boolean getMore = true; boolean retrieved = false; String result = leftovers; leftovers = ""; value.clear(); while (getMore) { retrieved = recordReader.next(key, line); if (retrieved) { String lineValue = line.toString(); // here, we assume sentences run until the period. int endOfSentence = lineValue.indexOf('.'); if (endOfSentence == -1) { result += " " + lineValue; } else { result += " " + lineValue.substring(0, endOfSentence + 1); leftovers = lineValue.substring(endOfSentence + 1); getMore = false; } } else { getMore = false; value.set(result); return false; } } value.set(result); return true; }
private void parseMetaData() throws IOException { Text line = new Text(); long read; FSDataInputStream in = null; LineReader lin = null; try { in = fs.open(masterIndexPath); FileStatus masterStat = fs.getFileStatus(masterIndexPath); masterIndexTimestamp = masterStat.getModificationTime(); lin = new LineReader(in, getConf()); read = lin.readLine(line); // the first line contains the version of the index file String versionLine = line.toString(); String[] arr = versionLine.split(" "); version = Integer.parseInt(arr[0]); // make it always backwards-compatible if (this.version > HarFileSystem.VERSION) { throw new IOException( "Invalid version " + this.version + " expected " + HarFileSystem.VERSION); } // each line contains a hashcode range and the index file name String[] readStr; while (read < masterStat.getLen()) { int b = lin.readLine(line); read += b; readStr = line.toString().split(" "); int startHash = Integer.parseInt(readStr[0]); int endHash = Integer.parseInt(readStr[1]); stores.add( new Store( Long.parseLong(readStr[2]), Long.parseLong(readStr[3]), startHash, endHash)); line.clear(); } } catch (IOException ioe) { LOG.warn("Encountered exception ", ioe); throw ioe; } finally { IOUtils.cleanup(LOG, lin, in); } FSDataInputStream aIn = fs.open(archiveIndexPath); try { FileStatus archiveStat = fs.getFileStatus(archiveIndexPath); archiveIndexTimestamp = archiveStat.getModificationTime(); LineReader aLin; // now start reading the real index file for (Store s : stores) { read = 0; aIn.seek(s.begin); aLin = new LineReader(aIn, getConf()); while (read + s.begin < s.end) { int tmp = aLin.readLine(line); read += tmp; String lineFeed = line.toString(); String[] parsed = lineFeed.split(" "); parsed[0] = decodeFileName(parsed[0]); archive.put(new Path(parsed[0]), new HarStatus(lineFeed)); line.clear(); } } } finally { IOUtils.cleanup(LOG, aIn); } }