Ejemplo n.º 1
0
    /**
     * Pairs the two titles that arrive for one document-number pair.
     *
     * <p>Each incoming value carries a language ID (left element) and a title (right element).
     * A title tagged {@code CLIRUtils.E} is stored in {@code eTitle}, one tagged
     * {@code CLIRUtils.F} in {@code fTitle}. The pair is emitted as {@code (fTitle, eTitle)} only
     * when exactly two titles were seen; otherwise the incomplete data is logged.
     *
     * <p>Note that the counter tracks the number of titles, not the number of distinct languages.
     *
     * @param docnoPair key identifying the document pair
     * @param titles language-tagged titles grouped under {@code docnoPair}
     * @param output collector receiving the {@code (fTitle, eTitle)} pair
     * @param reporter unused
     * @throws IOException if collecting the output fails
     * @throws RuntimeException if a title carries an unrecognised language ID
     */
    @Override
    public void reduce(
        PairOfInts docnoPair,
        Iterator<PairOfIntString> titles,
        OutputCollector<Text, Text> output,
        Reporter reporter)
        throws IOException {
      eTitle.clear();
      fTitle.clear();
      sLogger.info(docnoPair);

      int titlesSeen = 0;
      while (titles.hasNext()) {
        final PairOfIntString tagged = titles.next();
        sLogger.info(tagged);

        final int languageId = tagged.getLeftElement();
        if (languageId == CLIRUtils.E) {
          eTitle.set(tagged.getRightElement());
        } else if (languageId == CLIRUtils.F) {
          fTitle.set(tagged.getRightElement());
        } else {
          throw new RuntimeException("Unknown language ID: " + tagged.getLeftElement());
        }
        titlesSeen++;
      }

      if (titlesSeen != 2) {
        sLogger.info("Incomplete data for " + docnoPair + ":" + fTitle + "," + eTitle);
        return;
      }
      output.collect(fTitle, eTitle);
    }
  /**
   * Accumulates lines from {@code in} into {@code text} until a line matching
   * {@code delimiterPattern} is read or the input yields no more bytes.
   *
   * <p>Every line that is kept is written as {@code EOL} followed by the line's bytes, so the
   * accumulated record starts with an {@code EOL}. The delimiter line itself is not appended, but
   * its bytes are included in the returned count.
   *
   * @param text destination; cleared before reading
   * @param maxLineLength passed through to {@code in.readLine}
   * @param maxBytesToConsume passed through to {@code in.readLine}; also caps the number of lines
   *     read in one call
   * @return total number of bytes reported by {@code in.readLine} during this call
   * @throws IOException if reading from {@code in} fails
   */
  private int readNext(Text text, int maxLineLength, int maxBytesToConsume) throws IOException {
    text.clear();
    final Text lineBuffer = new Text();
    int consumed = 0;

    for (int linesRead = 0; linesRead < maxBytesToConsume; linesRead++) {
      final int lineBytes = in.readLine(lineBuffer, maxLineLength, maxBytesToConsume);
      consumed += lineBytes;
      final boolean isDelimiter = delimiterPattern.matcher(lineBuffer.toString()).matches();

      // Stop on end of input (zero bytes read) or when the record delimiter is reached.
      if (lineBytes == 0 || isDelimiter) {
        break;
      }

      // Keep the line as part of the current record.
      text.append(EOL.getBytes(), 0, EOL.getLength());
      text.append(lineBuffer.getBytes(), 0, lineBuffer.getLength());
    }

    return consumed;
  }
Ejemplo n.º 3
0
    /**
     * Builds one generated row and writes it to the output table as a single mutation.
     *
     * <p>The random generator is created lazily on the first call, seeded with
     * {@code rowId * 3} because three random numbers are used per row. The row id is stored as the
     * column qualifier (via {@code getRowIdString}) rather than inside the value.
     *
     * @param row the row id to generate
     * @param ignored unused
     * @param context task context used for status updates and for writing the mutation
     * @throws IOException if writing the mutation fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    public void map(LongWritable row, NullWritable ignored, Context context)
        throws IOException, InterruptedException {
      context.setStatus("Entering");

      final long rowId = row.get();
      if (rand == null) {
        // Three random numbers are drawn per row, hence the stride of 3 in the seed.
        rand = new RandomGenerator(rowId * 3);
      }

      // Populate the shared key and value buffers for this row.
      addKey();
      value.clear();
      addFiller(rowId);

      // One cell per row: family "c", qualifier = row id string, data = filler bytes.
      final Mutation mutation = new Mutation(key);
      mutation.put(
          new Text("c"),
          getRowIdString(rowId),
          new Value(value.toString().getBytes()));

      context.setStatus("About to add to accumulo");
      context.write(tableName, mutation);
      context.setStatus("Added to accumulo " + key.toString());
    }
Ejemplo n.º 4
0
    /**
     * Builds one record by concatenating the first tab-separated field of up to
     * {@code MAX_LINE_COUNT} non-empty lines, each followed by a single space.
     *
     * <p>Lines are pulled from {@code lineRecord} into the shared {@code lineKey}/{@code lineValue}
     * buffers. Empty lines are skipped and do not count towards the limit. For an input line such
     * as {@code "4847570<TAB>-1"} only {@code "4847570"} is kept. The {@code key} argument is not
     * modified by this method.
     *
     * <p>The first field is extracted with {@code indexOf} rather than {@code split}:
     * {@code String.split} drops trailing empty strings, so a line consisting only of tabs yields
     * a zero-length array and indexing element 0 would throw
     * {@code ArrayIndexOutOfBoundsException}. Such a line now contributes an empty first field,
     * consistent with how a line that merely starts with a tab is handled.
     *
     * @param key unused
     * @param value receives the concatenated fields; cleared first
     * @return {@code true} if the underlying reader returned at least one line during this call
     * @throws IOException if the underlying reader fails
     */
    public synchronized boolean next(LongWritable key, Text value) throws IOException {
      final byte[] space = {' '};
      boolean gotSomething = false;
      int counter = 0;
      value.clear();

      do {
        if (!lineRecord.next(lineKey, lineValue)) {
          break;
        }
        gotSomething = true;

        final String ln = lineValue.toString();
        if (ln.length() > 0) {
          // Keep only the text before the first tab (the whole line when there is no tab).
          final int tabAt = ln.indexOf('\t');
          lineValue.set(tabAt < 0 ? ln : ln.substring(0, tabAt));

          value.append(lineValue.getBytes(), 0, lineValue.getLength());
          value.append(space, 0, 1);
          counter++;
        }
      } while (counter < MAX_LINE_COUNT);

      return gotSomething;
    }
Ejemplo n.º 5
0
    /**
     * Builds one record by concatenating up to {@code MAX_LINE_COUNT} non-empty lines, each
     * followed by a single space.
     *
     * <p>Lines are pulled from {@code lineRecord} into the shared {@code lineKey}/{@code lineValue}
     * buffers. Empty lines are skipped and do not count towards the limit. The {@code key}
     * argument is not modified by this method.
     *
     * @param key unused
     * @param value receives the concatenated lines; cleared first
     * @return {@code true} if the underlying reader returned at least one line during this call
     * @throws IOException if the underlying reader fails
     */
    public synchronized boolean next(LongWritable key, Text value) throws IOException {
      final byte[] separator = {' '};
      boolean sawLine = false;
      int appended = 0;
      value.clear();

      do {
        if (!lineRecord.next(lineKey, lineValue)) {
          // Underlying reader is exhausted.
          break;
        }

        if (lineValue.toString().length() > 0) {
          value.append(lineValue.getBytes(), 0, lineValue.getLength());
          value.append(separator, 0, 1);
          appended++;
        }
        sawLine = true;
      } while (appended < MAX_LINE_COUNT);

      return sawLine;
    }
Ejemplo n.º 6
0
    /**
     * Sums the counts per dimension for one key and writes them as a single tab-separated value.
     *
     * <p>Each incoming value is expected to look like {@code "<count>:<dimension>"}. The output
     * value is a tab-separated list of {@code "<dimension>,<total>"} entries, in the iteration
     * order of the backing {@code HashMap}.
     *
     * <p>The output is assembled as a {@code String} and stored with {@code Text.set(String)} so
     * that it is encoded by {@code Text} itself. Appending {@code s.getBytes()} with
     * {@code s.length()} as the byte count truncates any entry containing multi-byte characters
     * and depends on the platform default charset.
     *
     * @param key the grouping key, written through unchanged
     * @param values {@code "<count>:<dimension>"} entries for {@code key}
     * @param context receives the {@code (key, newValue)} pair
     * @throws IOException if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     * @throws NumberFormatException if a count is not a valid integer
     */
    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      HashMap<String, Integer> totals = new HashMap<String, Integer>();

      for (Text val : values) {
        String[] fields = val.toString().split(":");
        int num = Integer.parseInt(fields[0]);
        String dim = fields[1];

        Integer running = totals.get(dim);
        totals.put(dim, running == null ? num : running + num);
      }

      StringBuilder joined = new StringBuilder();
      for (String dim : totals.keySet()) {
        if (joined.length() > 0) {
          joined.append('\t');
        }
        joined.append(dim).append(',').append(totals.get(dim));
      }

      // set() replaces the previous contents, so no explicit clear() is needed.
      newValue.set(joined.toString());
      context.write(key, newValue);
    }
Ejemplo n.º 7
0
 /**
  * Reads the first line into {@code value} and removes a leading UTF-8 byte order mark
  * (0xEF, 0xBB, 0xBF) if one is present.
  *
  * <p>The line is read with a limit three bytes larger than {@code maxLineLength} so that the BOM
  * does not eat into the allowed line length. {@code pos} is advanced by the full number of bytes
  * read, BOM included; the returned size excludes the three BOM bytes when a BOM was stripped.
  *
  * @return the number of bytes read for the line, minus 3 if a BOM was removed
  * @throws IOException if reading the line fails
  */
 private int skipUtfByteOrderMark() throws IOException {
   // Text only supports UTF-8, so only the UTF-8 BOM needs to be checked for.
   final int firstLineLimit = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE);
   final int bytesRead = in.readLine(value, firstLineLimit, maxBytesToConsume(pos));
   pos += bytesRead;

   final int length = value.getLength();
   final byte[] raw = value.getBytes();
   final boolean startsWithBom =
       length >= 3
           && raw[0] == (byte) 0xEF
           && raw[1] == (byte) 0xBB
           && raw[2] == (byte) 0xBF;
   if (!startsWithBom) {
     return bytesRead;
   }

   LOG.info("Found UTF-8 BOM and skipped it");
   final int remaining = length - 3;
   if (remaining > 0) {
     // Copy first: set() is given a buffer distinct from the one it overwrites.
     value.set(value.copyBytes(), 3, remaining);
   } else {
     value.clear();
   }
   return bytesRead - 3;
 }
Ejemplo n.º 8
0
    /**
     * Parses one tab-separated qseq line into a record key and a {@link SequencedFragment}.
     *
     * <p>Field offsets and lengths are computed first by {@code setFieldPositionsAndLengths}. The
     * key is fields 0 through 5 followed by field 7 (the read number), with the separating tabs
     * replaced by colons. The fragment is populated from the fields as follows: 0 instrument,
     * 1 run number, 2 lane, 3 tile, 4 x-pos, 5 y-pos, 6 index sequence, 7 read number,
     * 8 sequence, 9 quality, 10 filter flag.
     *
     * @param line the raw qseq line
     * @param key receives the colon-separated key; cleared first
     * @param fragment receives the parsed fields; cleared first
     * @throws FormatException if a field cannot be decoded as text
     * @throws NumberFormatException if a numeric field is not a valid integer
     */
    private void scanQseqLine(Text line, Text key, SequencedFragment fragment) {
      setFieldPositionsAndLengths(line);
      final byte[] raw = line.getBytes();

      // Key, part 1: everything up to and including field 5, tabs turned into colons.
      key.clear();
      key.append(raw, 0, fieldPositions[5] + fieldLengths[5]);
      final byte[] prefix = key.getBytes();
      final int prefixLength = key.getLength();
      for (int i = 0; i < prefixLength; ++i) {
        if (prefix[i] == '\t') {
          prefix[i] = ':';
        }
      }

      // Key, part 2: the read number together with the tab that precedes it (hence -1 / +1),
      // then that tab is overwritten with a colon. The backing array is fetched again because
      // append() may have replaced it.
      key.append(raw, fieldPositions[7] - 1, fieldLengths[7] + 1);
      key.getBytes()[prefixLength] = ':';

      try {
        fragment.clear();
        fragment.setInstrument(Text.decode(raw, fieldPositions[0], fieldLengths[0]));
        fragment.setRunNumber(
            Integer.parseInt(Text.decode(raw, fieldPositions[1], fieldLengths[1])));
        fragment.setLane(Integer.parseInt(Text.decode(raw, fieldPositions[2], fieldLengths[2])));
        fragment.setTile(Integer.parseInt(Text.decode(raw, fieldPositions[3], fieldLengths[3])));
        fragment.setXpos(Integer.parseInt(Text.decode(raw, fieldPositions[4], fieldLengths[4])));
        fragment.setYpos(Integer.parseInt(Text.decode(raw, fieldPositions[5], fieldLengths[5])));
        fragment.setRead(Integer.parseInt(Text.decode(raw, fieldPositions[7], fieldLengths[7])));
        fragment.setFilterPassed(raw[fieldPositions[10]] != '0');

        // A leading '0' in the index field denotes a null index sequence.
        final boolean nullIndex = fieldLengths[6] > 0 && raw[fieldPositions[6]] == '0';
        if (nullIndex) {
          fragment.setIndexSequence(null);
        } else {
          fragment.setIndexSequence(
              Text.decode(raw, fieldPositions[6], fieldLengths[6]).replace('.', 'N'));
        }
      } catch (CharacterCodingException e) {
        throw new FormatException(
            "Invalid character format at "
                + makePositionMessage(this.pos - line.getLength())
                + "; line: "
                + line);
      }

      fragment.getSequence().append(raw, fieldPositions[8], fieldLengths[8]);
      fragment.getQuality().append(raw, fieldPositions[9], fieldLengths[9]);
    }
 /**
  * Reads the next line and splits it into a fixed-width key and the remaining value.
  *
  * <p>The first {@code KEY_LENGTH} bytes of the line become the key and the rest becomes the
  * value. A line shorter than {@code KEY_LENGTH} is used whole as the key, with an empty value.
  *
  * @param key receives the key bytes
  * @param value receives the bytes after the key, or is cleared for a short line
  * @return {@code true} if a line was read, {@code false} once the input is exhausted
  * @throws IOException if the underlying reader fails
  */
 public boolean next(Text key, Text value) throws IOException {
   if (!in.next(junk, line)) {
     return false;
   }

   final int lineLength = line.getLength();
   if (lineLength >= KEY_LENGTH) {
     final byte[] raw = line.getBytes();
     key.set(raw, 0, KEY_LENGTH);
     value.set(raw, KEY_LENGTH, lineLength - KEY_LENGTH);
   } else {
     // Line is too short to hold a full key: treat all of it as the key.
     key.set(line);
     value.clear();
   }
   return true;
 }
Ejemplo n.º 10
0
 /**
  * Decodes an encoded URL string.
  *
  * <p>When {@code enc} is {@code "jsescape"} or {@code "js_escape"} (compared after lower-casing)
  * the input is passed through {@code Escape.unescape}. Otherwise it is decoded with
  * {@link URLDecoder#decode(String, String)} using {@code enc} as the charset name; if that
  * decoding throws, the input is returned unchanged. The result is written into the shared
  * {@code dstURL} buffer, which is also the returned object.
  *
  * @param srcURL the encoded URL; {@code null} yields {@code null}
  * @param enc the encoding name; {@code null} means {@code "UTF8"}
  * @return the shared {@code dstURL} holding the decoded text, or {@code null} for null input
  */
 public Text evaluate(Text srcURL, Text enc) {
   if (srcURL == null) {
     return null;
   }

   final String encoded = srcURL.toString();
   final String encodingName = (enc == null) ? "UTF8" : enc.toString();
   dstURL.clear();

   final String lowered = encodingName.toLowerCase();
   final boolean jsEscaped = "jsescape".equals(lowered) || "js_escape".equals(lowered);
   if (jsEscaped) {
     dstURL.set(Escape.unescape(encoded));
   } else {
     try {
       dstURL.set(URLDecoder.decode(encoded, encodingName));
     } catch (Exception e) {
       // Decoding failed (e.g. unsupported encoding or malformed input): pass the input through.
       dstURL.set(srcURL);
     }
   }
   return dstURL;
 }
  /**
   * Reads all remaining lines from the underlying reader and sets them as one value.
   *
   * <p>Each non-empty line is appended followed by {@code "\n"}; empty lines are dropped.
   *
   * <p>A line is only appended when the underlying reader reported that it actually produced
   * one. Appending after the reader has returned {@code false} can duplicate the final line if
   * the reader leaves the previous content in its value buffer.
   *
   * <p>The return value reflects whether anything was read. Returning {@code true}
   * unconditionally would make a caller that loops on {@code next} spin forever on empty
   * records once the input is exhausted.
   *
   * @param key passed through to the underlying reader, which sets it for each line
   * @param value receives the concatenated lines; cleared first
   * @return {@code true} if at least one line was read during this call, {@code false} if the
   *     input was already exhausted
   * @throws IOException if the underlying reader fails
   */
  public synchronized boolean next(LongWritable key, Text value) throws IOException {
    Text line = new Text();
    StringBuilder contents = new StringBuilder();
    boolean sawLine = false;

    value.clear();

    while (recordReader.next(key, line)) {
      sawLine = true;
      String lineValue = line.toString();
      if (lineValue.length() > 0) {
        contents.append(lineValue).append('\n');
      }
    }

    value.set(contents.toString());
    return sawLine;
  }
  /**
   * Finds a full sentence and sets it as the value.
   *
   * <p>A sentence is assumed to run until the first period. Lines are read from the underlying
   * reader and joined with a single space (each line is preceded by a space) until a line
   * containing a period is found. The text after that period is kept in {@code leftovers} and is
   * used as the start of the next sentence.
   *
   * <p>When the input runs out before a period is found, any pending non-blank text is returned
   * as a final record. Returning {@code false} in that case would make callers discard the text
   * that was just placed in {@code value}. The following call returns {@code false}.
   *
   * <p>Only the first period of each newly read line is considered; periods inside
   * {@code leftovers} are not re-examined.
   *
   * @param key passed through to the underlying reader, which sets it for each line
   * @param value receives the sentence; cleared first
   * @return {@code true} if {@code value} holds a sentence or trailing text, {@code false} if
   *     the input is exhausted and nothing is pending
   * @throws IOException if the underlying reader fails
   */
  public synchronized boolean next(LongWritable key, Text value) throws IOException {
    Text line = new Text();

    StringBuilder sentence = new StringBuilder();
    if (leftovers != null) {
      sentence.append(leftovers);
    }
    leftovers = "";

    value.clear();

    while (recordReader.next(key, line)) {
      String lineValue = line.toString();

      // here, we assume sentences run until the period.
      int endOfSentence = lineValue.indexOf('.');

      sentence.append(' ');
      if (endOfSentence == -1) {
        sentence.append(lineValue);
      } else {
        sentence.append(lineValue, 0, endOfSentence + 1);
        leftovers = lineValue.substring(endOfSentence + 1);
        value.set(sentence.toString());
        return true;
      }
    }

    // Input exhausted without reaching a period: emit whatever text is pending, if any.
    String tail = sentence.toString();
    value.set(tail);
    return tail.trim().length() > 0;
  }
Ejemplo n.º 13
0
    /**
     * Loads the archive metadata from the master index and the archive index.
     *
     * <p>Phase 1 reads the file at {@code masterIndexPath}: the first line supplies the index
     * version, and every following line describes one {@code Store} that is added to
     * {@code stores}. Phase 2 reads the file at {@code archiveIndexPath}: for each store it seeks
     * to {@code s.begin} and reads lines until {@code s.end} is reached, adding one
     * {@code HarStatus} per line to {@code archive}, keyed by the decoded file name.
     *
     * <p>Also records the modification times of both index files in
     * {@code masterIndexTimestamp} and {@code archiveIndexTimestamp}.
     *
     * @throws IOException if either index cannot be read, or if the index version is greater
     *     than {@code HarFileSystem.VERSION}
     */
    private void parseMetaData() throws IOException {
      Text line = new Text();
      // Running count of bytes consumed from the index currently being read.
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // make it always backwards-compatible
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException(
              "Invalid version " + this.version + " expected " + HarFileSystem.VERSION);
        }

        // each line contains a hashcode range and the index file name
        // Fields are space-separated: [0] start hash, [1] end hash, [2] and [3] are the two
        // long values handed to Store (presumably the begin/end offsets read back below as
        // s.begin / s.end — confirm against the Store constructor).
        String[] readStr;
        while (read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash = Integer.parseInt(readStr[1]);
          stores.add(
              new Store(
                  Long.parseLong(readStr[2]), Long.parseLong(readStr[3]), startHash, endHash));
          line.clear();
        }
      } catch (IOException ioe) {
        // Log for diagnostics, then let the caller see the original failure.
        LOG.warn("Encountered exception ", ioe);
        throw ioe;
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s : stores) {
          read = 0;
          aIn.seek(s.begin);
          // A fresh LineReader is created after each seek and is never closed here; only aIn
          // is cleaned up in the finally block.
          // NOTE(review): presumably a new reader is needed so no buffered bytes from the
          // previous position are reused — confirm.
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            // The first space-separated token is the encoded file name; the full line is kept
            // for HarStatus to parse.
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally {
        IOUtils.cleanup(LOG, aIn);
      }
    }