예제 #1
0
 /**
  * Bumps the running count for the word carried in the first field of the
  * tuple and emits the pair {@code (word, updatedCount)}.
  *
  * @param tuple     input tuple; its value at index 0 is read as the word
  * @param collector collector the updated count is emitted on
  */
 @Override
 public void execute(Tuple tuple, BasicOutputCollector collector) {
   String word = tuple.getString(0);
   Integer previous = counts.get(word);
   // A word that has not been seen yet starts from zero, so its first count is 1.
   Integer updated = (previous == null) ? 1 : previous + 1;
   counts.put(word, updated);
   collector.emit(new Values(word, updated));
 }
예제 #2
0
 /**
  * Splits the incoming line on {@code seprator}, binds every resulting field
  * to the positional parameter with the same (1-based) index of
  * {@code insertSql}, executes the insert and finally re-emits the original
  * line.
  *
  * <p>A new statement is prepared for every tuple, so it is closed in the
  * {@code finally} block; otherwise each processed tuple would leave an open
  * statement behind on the connection.
  *
  * @param input     tuple whose value at index 0 is the line to store
  * @param collector collector the original line is emitted on after a
  *                  successful insert
  * @throws RuntimeException wrapping the {@link SQLException} when preparing,
  *                          binding or executing the statement fails
  */
 @Override
 public void execute(Tuple input, BasicOutputCollector collector) {
   String line = input.getString(0);
   String[] fields = line.split(seprator);
   try {
     pstmt = conn.prepareStatement(insertSql);
     for (int i = 0; i < fields.length; i++) {
       // JDBC parameter indexes are 1-based.
       pstmt.setString(i + 1, fields[i]);
     }
     pstmt.executeUpdate();
   } catch (SQLException e) {
     log.error("数据插入失败", e);
     throw new RuntimeException("数据插入失败", e);
   } finally {
     // Release the per-tuple statement. Closing an already closed statement
     // is a no-op, so this is safe even when prepareStatement itself failed
     // and the field still refers to the statement of an earlier tuple.
     if (pstmt != null) {
       try {
         pstmt.close();
       } catch (SQLException closeFailure) {
         // A failed close must not mask the outcome of the insert itself.
         log.error("Failed to close prepared statement", closeFailure);
       }
     }
   }
   collector.emit(new Values(line));
 }
 /**
  * Reads the first value of the tuple as a {@code Long}, adds one, doubles
  * the result, emits it and then acks the input tuple.
  *
  * @param tuple input tuple; its value at index 0 is cast to {@code Long}
  */
 @Override
 public void execute(Tuple tuple) {
   Long received = (Long) tuple.getValue(0);
   long incremented = received + 1;
   long doubled = 2 * incremented;
   collector.emit(new Values(doubled));
   collector.ack(tuple);
 }
예제 #4
0
 /**
  * Forwards the values of the incoming tuple unchanged.
  *
  * @param input     tuple whose values are re-emitted as they are
  * @param collector collector the values are emitted on
  */
 @Override
 public void execute(Tuple input, BasicOutputCollector collector) {
   java.util.List<Object> passthrough = input.getValues();
   collector.emit(passthrough);
 }
 /**
  * Adds the value at index 2 of the tuple to the running sum stored in
  * {@code _sums} under the key found at index 1. A key with no stored sum
  * starts from {@code 0}.
  *
  * @param tuple input tuple; index 1 is used as the key, index 2 as the addend
  */
 @Override
 public void execute(Tuple tuple) {
   Object groupKey = tuple.getValue(1);
   Number runningTotal = Utils.get(_sums, groupKey, 0);
   Object addend = tuple.getValue(2);
   Number updatedTotal = Numbers.add(runningTotal, addend);
   _sums.put(groupKey, updatedTotal);
 }
예제 #6
0
  /**
   * Handles one fetched document, treating it as a feed when possible.
   *
   * <p>The tuple fields {@code "metadata"}, {@code "content"} and
   * {@code "url"} are read. A document counts as a feed when its metadata
   * holds a true value under {@code isFeedKey} or, if
   * {@code sniffWhenNoMDKey} is set, when its Content-Type value contains
   * {@code "rss+xml"}. A document without a Content-Type value is not
   * considered a feed by the sniffing step.
   *
   * <ul>
   *   <li>Not a feed: the tuple values are re-emitted unchanged, anchored on
   *       the input tuple.</li>
   *   <li>Feed parsing or parse filtering fails: the URL is emitted on the
   *       status stream with {@code Status.ERROR}, with the error source and
   *       message stored in the metadata.</li>
   *   <li>Otherwise: every outlink is emitted on the status stream with
   *       {@code Status.DISCOVERED}, followed by the feed URL itself with
   *       {@code Status.FETCHED}.</li>
   * </ul>
   *
   * <p>The input tuple is acked on every path.
   *
   * @param tuple input tuple providing the metadata, content and url fields
   */
  @Override
  public void execute(Tuple tuple) {
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");
    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");

    boolean isfeed = Boolean.parseBoolean(metadata.getFirstValue(isFeedKey));
    // doesn't have the metadata expected
    if (!isfeed && sniffWhenNoMDKey) {
      // uses mime-type
      // won't work when servers return text/xml
      // TODO use Tika instead?
      String ct = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
      // The header may be missing altogether; guard against a null value
      // instead of failing the whole tuple with a NullPointerException.
      if (ct != null && ct.contains("rss+xml")) {
        isfeed = true;
      }
    }

    // still not a feed file
    if (!isfeed) {
      // just pass it on
      collector.emit(tuple, tuple.getValues());
      collector.ack(tuple);
      return;
    }

    // can be used later on for custom scheduling
    metadata.setValue(isFeedKey, "true");

    List<Outlink> outlinks;
    try {
      outlinks = parseFeed(url, content, metadata);
    } catch (Exception e) {
      // exception while parsing the feed
      String errorMessage = "Exception while parsing " + url + ": " + e;
      LOG.error(errorMessage);
      // send to status stream in case another component wants to update
      // its status
      metadata.setValue(Constants.STATUS_ERROR_SOURCE, "feed parsing");
      metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
      collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
      collector.ack(tuple);
      return;
    }

    // apply the parse filters if any to the current document
    try {
      ParseResult parse = new ParseResult();
      parse.setOutlinks(outlinks);
      ParseData parseData = parse.get(url);
      parseData.setMetadata(metadata);
      parseFilters.filter(url, content, null, parse);
    } catch (RuntimeException e) {
      String errorMessage = "Exception while running parse filters on " + url + ": " + e;
      LOG.error(errorMessage);
      metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
      metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
      collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
      collector.ack(tuple);
      return;
    }

    // send to status stream
    for (Outlink ol : outlinks) {
      Values v = new Values(ol.getTargetURL(), ol.getMetadata(), Status.DISCOVERED);
      collector.emit(Constants.StatusStreamName, tuple, v);
    }

    // marking the main URL as successfully fetched; this point is only
    // reached when both feed parsing and parse filtering succeeded
    collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
    collector.ack(tuple);
  }