示例#1
0
 /**
  * Emits one record per input page: the key is the page title (spaces replaced by
  * underscores) and the value is the tab-separated list of distinct outgoing links found in
  * the page's {@code <text>} element, in order of first appearance and excluding links that
  * equal the page's own title.
  *
  * <p>Pages without a complete {@code <title>...</title>} element are skipped. Pages with a
  * title but no complete {@code <text ...>...</text>} element are emitted with an empty value.
  *
  * @param key input key; not used by this method
  * @param value raw page markup
  * @param context sink for the (title, out-links) record
  * @throws IOException if decoding the page bytes or writing the record fails
  * @throws InterruptedException if writing the record is interrupted
  */
 public void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   // Check each find() result before reusing it: -1 is "not found", not a valid search offset.
   int start = value.find("<title>");
   if (start == -1) return;
   int end = value.find("</title>", start);
   if (end == -1) return;
   start += 7; // skip past "<title>"
   String title = Text.decode(value.getBytes(), start, end - start);
   title = title.replace(' ', '_');
   Text titleKey = new Text(title);

   // Locate "<text ...>" and then its closing tag. The closing tag is searched from the opening
   // tag onwards so that the decoded range can never have a negative length.
   int textStart = value.find("<text");
   if (textStart != -1) {
     textStart = value.find(">", textStart);
   }
   int textEnd = (textStart == -1) ? -1 : value.find("</text>", textStart);
   if (textEnd == -1) {
     context.write(titleKey, new Text(""));
     return;
   }
   textStart += 1; // skip past the '>' that ends the opening tag
   String text = Text.decode(value.getBytes(), textStart, textEnd - textStart);

   // A LinkedHashSet removes duplicates while keeping first-seen order.
   LinkedHashSet<String> uniqueLinks = new LinkedHashSet<String>();
   Matcher wikiLinksMatcher = patterLinks.matcher(text);
   while (wikiLinksMatcher.find()) {
     String outLinkPage = linksCatcher(wikiLinksMatcher.group());
     if (outLinkPage == null) {
       continue;
     }
     // Trim before the emptiness test so whitespace-only links are dropped instead of being
     // emitted as empty fields.
     outLinkPage = outLinkPage.trim();
     if (outLinkPage.isEmpty() || outLinkPage.equals(title)) {
       continue;
     }
     uniqueLinks.add(outLinkPage);
   }

   StringBuilder outLinks = new StringBuilder();
   for (String link : uniqueLinks) {
     if (outLinks.length() > 0) {
       outLinks.append('\t');
     }
     outLinks.append(link);
   }
   context.write(titleKey, new Text(outLinks.toString()));
 }
示例#2
0
  /**
   * Advances the wrapped line reader and derives the current key from the first two
   * tab-separated fields of the new line, both parsed as decimal integers.
   *
   * @return {@code false} when the wrapped reader has no further line, {@code true} otherwise
   * @throws IOException if the wrapped reader fails
   * @throws CharacterCodingException if a field cannot be decoded
   */
  @Override
  public boolean nextKeyValue() throws IOException, CharacterCodingException {
    if (!lineRR.nextKeyValue()) {
      return false;
    }

    final Text current = getCurrentValue();

    // First field: everything before the first tab.
    final int firstTab = current.find("\t");
    final int rid = Integer.parseInt(Text.decode(current.getBytes(), 0, firstTab));

    // Second field: the bytes strictly between the first and the second tab.
    final int secondTab = current.find("\t", firstTab + 1);
    final int fieldStart = firstTab + 1;
    final int fieldLength = secondTab - fieldStart;
    final int pos = Integer.parseInt(Text.decode(current.getBytes(), fieldStart, fieldLength));

    key.set(BAMRecordReader.getKey0(rid, pos));
    return true;
  }
示例#3
0
 /**
  * Returns the names of all secret keys held by {@code credentials} whose name does not
  * contain the character {@code '@'}.
  *
  * @return a new list of matching key names, in the order the credentials report them
  * @throws IOException declared by the overridden method
  */
 @Override
 public synchronized List<String> getKeys() throws IOException {
   final List<String> keyNames = new ArrayList<String>();
   for (Text secretKey : credentials.getAllSecretKeys()) {
     final boolean containsAt = secretKey.find("@") != -1;
     if (containsAt) {
       continue;
     }
     keyNames.add(secretKey.toString());
   }
   return keyNames;
 }
  /**
   * Extracts the contents of the {@code <title>} and {@code <text ...>} elements from a page.
   *
   * @param value raw page markup
   * @return a two-element array {@code {title, text}}; {@code {"", ""}} when either element is
   *     missing or incomplete
   * @throws CharacterCodingException if the selected bytes cannot be decoded
   */
  private String[] getTitleAndText(Text value) throws CharacterCodingException {
    // Every find() result is checked before it is adjusted or reused as a search offset:
    // -1 means "not found", and adding to it first would hide the miss.
    int start = value.find("<title>");
    if (start == -1) {
      return new String[] {"", ""};
    }
    int end = value.find("</title>", start);
    if (end == -1) {
      return new String[] {"", ""};
    }
    start += 7; // add <title> length.
    String title = Text.decode(value.getBytes(), start, end - start);

    start = value.find("<text");
    if (start == -1) {
      return new String[] {"", ""};
    }
    start = value.find(">", start); // end of the opening tag, which may carry attributes
    if (start == -1) {
      return new String[] {"", ""};
    }
    end = value.find("</text>", start);
    if (end == -1) {
      return new String[] {"", ""};
    }
    start += 1; // skip past '>'
    String text = Text.decode(value.getBytes(), start, end - start);

    return new String[] {title, text};
  }
    /*
     * Walks the line from left to right, splitting it at each occurrence of the
     * delimiter, and records every field's starting byte offset and byte length
     * in the instance arrays 'fieldPositions' and 'fieldLengths'. Scanning stops
     * at the end of the line or once NUM_QSEQ_COLS fields have been recorded.
     *
     * @exception FormatException Line doesn't have the expected number of fields.
     */
    private void setFieldPositionsAndLengths(Text line) {
      final int lineLength = line.getLength();
      int cursor = 0; // byte offset where the current field starts
      int fieldIndex = 0; // number of fields recorded so far

      for (; cursor < lineLength && fieldIndex < NUM_QSEQ_COLS; fieldIndex++) {
        int fieldEnd = line.find(Delim, cursor);
        if (fieldEnd < 0) {
          // no further delimiter: the field runs to the end of the line
          fieldEnd = lineLength;
        }

        fieldPositions[fieldIndex] = cursor;
        fieldLengths[fieldIndex] = fieldEnd - cursor;

        // step over the delimiter to the start of the next field
        cursor = fieldEnd + 1;
      }

      if (fieldIndex == NUM_QSEQ_COLS) {
        return;
      }

      throw new FormatException(
          "found "
              + fieldIndex
              + " fields instead of 11 at "
              + makePositionMessage(this.pos - lineLength)
              + ". Line: "
              + line);
    }
示例#6
0
    /**
     * Scans {@code value} for entries containing the token {@code node} followed by
     * {@code id}, {@code lat} and {@code lon} attributes, and writes {@code (node, id)} for
     * every entry whose latitude lies in [minlat, maxlat] and longitude in [minlon, maxlon].
     *
     * <p>Scanning stops at the first entry that is missing an attribute or whose attribute
     * value is not terminated by a double quote before the end of the input.
     *
     * @param key input key; not used by this method
     * @param value raw text to scan
     * @param context sink for the (node, id) records
     * @throws IOException if writing a record fails
     * @throws InterruptedException if writing a record is interrupted
     */
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      final int length = value.getLength();
      final StringBuilder attr = new StringBuilder();
      int start = 0;
      while (start < length) {
        // Find the start of a node
        start = value.find("node", start);
        if (start == -1) break;

        // Find the id of that node; +5 skips the five characters of ` id="`
        // (assumes the attribute is written exactly in that form)
        start = value.find(" id", start);
        if (start == -1) break;
        start = readAttributeValue(value, start + 5, attr);
        if (start >= length) break; // no closing quote: the entry is incomplete
        try {
          id.set(Long.parseLong(attr.toString()));
        } catch (NumberFormatException e) {
          // Ids that cannot be represented as a signed long fall back to a placeholder id.
          id.set(404);
        }

        // Latitude; +6 skips the six characters of ` lat="`
        start = value.find(" lat", start);
        if (start == -1) break;
        start = readAttributeValue(value, start + 6, attr);
        if (start >= length) break;
        node.setY(Double.parseDouble(attr.toString()));
        if (node.getY() < minlat || node.getY() > maxlat) {
          continue; // Don't care about nodes outside our area of interest
        }

        // Longitude; +6 skips the six characters of ` lon="`
        start = value.find(" lon", start);
        if (start == -1) break;
        start = readAttributeValue(value, start + 6, attr);
        if (start >= length) break;
        node.setX(Double.parseDouble(attr.toString()));
        if (node.getX() < minlon || node.getX() > maxlon) {
          continue; // Don't care about nodes outside our area of interest
        }
        context.write(node, id);
      }
    }

    /**
     * Clears {@code out} and copies characters from {@code value}, starting at byte offset
     * {@code from}, up to but not including the next double quote.
     *
     * @return the offset of the terminating double quote, or an offset greater than or equal
     *     to {@code value.getLength()} when no double quote was found
     */
    private int readAttributeValue(Text value, int from, StringBuilder out) {
      out.setLength(0);
      int pos = from;
      for (; pos < value.getLength(); pos++) {
        int c = value.charAt(pos);
        if (c == '"') {
          break;
        }
        // Text.charAt returns an int; the cast selects append(char). Without it,
        // append(int) would add the decimal code-point number instead of the character.
        out.append((char) c);
      }
      return pos;
    }