public NutchDocument filter(
      NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
      throws IndexingException {
    Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
    String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
    String urlString = url.toString();

    String host = null;
    try {
      URL u;
      if (reprUrlString != null) {
        u = new URL(reprUrlString);
      } else {
        u = new URL(urlString);
      }
      host = u.getHost();
    } catch (MalformedURLException e) {
      throw new IndexingException(e);
    }

    if (host != null) {
      doc.add("host", host);
    }

    doc.add("url", reprUrlString == null ? urlString : reprUrlString);

    // content
    String content = parse.getText();
    if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) {
      content = content.substring(0, MAX_CONTENT_LENGTH);
    }
    doc.add("content", content);

    // title
    String title = parse.getData().getTitle();
    if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
      title = title.substring(0, MAX_TITLE_LENGTH);
    }

    if (title.length() > 0) {
      // NUTCH-1004 Do not index empty values for title field
      doc.add("title", title);
    }

    // add cached content/summary display policy, if available
    String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
    if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
      doc.add("cache", caching);
    }

    // add timestamp when fetched, for deduplication
    doc.add("tstamp", new Date(datum.getFetchTime()));

    return doc;
  }
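The truncation limits MAX_TITLE_LENGTH and MAX_CONTENT_LENGTH are used above but not defined in this snippet. A minimal sketch of how they might be initialized in the filter's setConf, assuming the usual Nutch property names indexer.max.title.length and indexer.max.content.length (both the names and the defaults are assumptions, not taken from the snippet):

  // Sketch only: possible initialization of the truncation limits used above.
  // Property names and defaults are assumptions, not part of the original example.
  private Configuration conf;
  private int MAX_TITLE_LENGTH;
  private int MAX_CONTENT_LENGTH;

  public void setConf(Configuration conf) {
    this.conf = conf;
    MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
    // -1 means "do not truncate content" in the length check above
    MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1);
  }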
Example #2
  /**
   * Creates a {@link FtpResponse} object for the given url and returns a {@link
   * ProtocolOutput} object based on the content received
   *
   * @param url Text containing the ftp url
   * @param datum The CrawlDatum object corresponding to the url
   * @return {@link ProtocolOutput} object for the url
   */
  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
    String urlString = url.toString();
    try {
      URL u = new URL(urlString);

      int redirects = 0;

      while (true) {
        FtpResponse response;
        response = new FtpResponse(u, datum, this, getConf()); // make a request

        int code = response.getCode();
        datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(Integer.toString(code)));

        if (code == 200) { // got a good response
          return new ProtocolOutput(response.toContent()); // return it

        } else if (code >= 300 && code < 400) { // handle redirect
          if (redirects == MAX_REDIRECTS) throw new FtpException("Too many redirects: " + url);
          u = new URL(response.getHeader("Location"));
          redirects++;
          if (LOG.isTraceEnabled()) {
            LOG.trace("redirect to " + u);
          }
        } else { // convert to exception
          throw new FtpError(code);
        }
      }
    } catch (Exception e) {
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
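A minimal caller sketch for getProtocolOutput, assuming the standard Nutch protocol classes (NutchConfiguration, ProtocolOutput, Content); the helper method name below is hypothetical:

  // Hypothetical caller: fetch a single FTP URL and check the protocol status.
  public Content fetchFtpUrl(String ftpUrl) {
    Configuration conf = NutchConfiguration.create();
    Ftp ftp = new Ftp();
    ftp.setConf(conf);

    ProtocolOutput out = ftp.getProtocolOutput(new Text(ftpUrl), new CrawlDatum());
    if (!out.getStatus().isSuccess()) {
      // the ProtocolStatus wraps either the FTP response code or the caught exception
      return null;
    }
    return out.getContent();
  }

Example #3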
 public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
   SequenceFile.Reader[] readers =
       SequenceFileOutputFormat.getReaders(
           getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
   long cnt = 0L;
   Text key = new Text();
   for (int i = 0; i < readers.length; i++) {
     while (readers[i].next(key)) cnt++;
     readers[i].close();
   }
   stats.generated = cnt;
   Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
   if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
     cnt = 0L;
     long start = Long.MAX_VALUE;
     long end = Long.MIN_VALUE;
     CrawlDatum value = new CrawlDatum();
     MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
     for (int i = 0; i < mreaders.length; i++) {
       while (mreaders[i].next(key, value)) {
         cnt++;
         if (value.getFetchTime() < start) start = value.getFetchTime();
         if (value.getFetchTime() > end) end = value.getFetchTime();
       }
       mreaders[i].close();
     }
     stats.start = start;
     stats.end = end;
     stats.fetched = cnt;
   }
   Path parseDir = new Path(segment, ParseData.DIR_NAME);
   if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
     cnt = 0L;
     long errors = 0L;
     ParseData value = new ParseData();
     MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
     for (int i = 0; i < mreaders.length; i++) {
       while (mreaders[i].next(key, value)) {
         cnt++;
         if (!value.getStatus().isSuccess()) errors++;
       }
       mreaders[i].close();
     }
     stats.parsed = cnt;
     stats.parseErrors = errors;
   }
 }
Example #4
 private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException {
   CrawlDatum newDatum =
       new CrawlDatum(
           CrawlDatum.STATUS_DB_UNFETCHED, fit.datum.getFetchInterval(), fit.datum.getScore());
   // transfer all existing metadata to the redirect
   newDatum.getMetaData().putAll(fit.datum.getMetaData());
   scfilters.initialScore(redirUrl, newDatum);
   if (reprUrl != null) {
     newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
   }
   fit = FetchItem.create(redirUrl, newDatum, queueMode);
   if (fit != null) {
     FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
     fiq.addInProgressFetchItem(fit);
   } else {
     // stop redirecting
     redirecting = false;
     reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
   }
   return fit;
 }
Example #5
  private ParseStatus output(
      Text key,
      CrawlDatum datum,
      Content content,
      ProtocolStatus pstatus,
      int status,
      int outlinkDepth) {

    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

    ParseResult parseResult = null;
    if (content != null) {
      Metadata metadata = content.getMetadata();

      // store the guessed content type in the crawldatum
      if (content.getContentType() != null)
        datum
            .getMetaData()
            .put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));

      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }
      /*
       * Note: Fetcher will only follow meta-redirects coming from the
       * original URL.
       */
      if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
        if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
          try {
            parseResult = this.parseUtil.parse(content);
          } catch (Exception e) {
            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
          }
        }

        if (parseResult == null) {
          byte[] signature =
              SignatureFactory.getSignature(conf)
                  .calculate(content, new ParseStatus().getEmptyParse(conf));
          datum.setSignature(signature);
        }
      }

      /*
       * Store status code in content, so we can read this value during parsing
       * (as a separate job) and decide whether to parse or not.
       */
      content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }

    try {
      output.collect(key, new NutchWritable(datum));
      if (content != null && storingContent) output.collect(key, new NutchWritable(content));
      if (parseResult != null) {
        for (Entry<Text, Parse> entry : parseResult) {
          Text url = entry.getKey();
          Parse parse = entry.getValue();
          ParseStatus parseStatus = parse.getData().getStatus();
          ParseData parseData = parse.getData();

          if (!parseStatus.isSuccess()) {
            LOG.warn("Error parsing: " + key + ": " + parseStatus);
            parse = parseStatus.getEmptyParse(conf);
          }

          // Calculate page signature. For non-parsing fetchers this will
          // be done in ParseSegment
          byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
          // Ensure segment name and score are in parseData metadata
          parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
          parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
          // Pass fetch time to content meta
          parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
          if (url.equals(key)) datum.setSignature(signature);
          try {
            scfilters.passScoreAfterParsing(url, content, parse);
          } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
              LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
            }
          }

          String origin = null;

          // collect outlinks for subsequent db update
          Outlink[] links = parseData.getOutlinks();
          int outlinksToStore = Math.min(maxOutlinks, links.length);
          if (ignoreExternalLinks || ignoreInternalLinks) {
            URL originURL = new URL(url.toString());
            // based on domain?
            if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
              origin = URLUtil.getDomainName(originURL).toLowerCase();
            }
            // use host
            else {
              origin = originURL.getHost().toLowerCase();
            }
          }

          // used by fetchNode
          if (fetchNode != null) {
            fetchNode.setOutlinks(links);
            fetchNode.setTitle(parseData.getTitle());
            FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
          }
          int validCount = 0;

          // Process all outlinks, normalize, filter and deduplicate
          List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
          HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
          for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
            String toUrl = links[i].getToUrl();

            toUrl =
                ParseOutputFormat.filterNormalize(
                    url.toString(),
                    toUrl,
                    origin,
                    ignoreInternalLinks,
                    ignoreExternalLinks,
                    ignoreExternalLinksMode,
                    urlFilters,
                    urlExemptionFilters,
                    normalizers);
            if (toUrl == null) {
              continue;
            }

            validCount++;
            links[i].setUrl(toUrl);
            outlinkList.add(links[i]);
            outlinks.add(toUrl);
          }

          // Only process depth N outlinks
          if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
            reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size());

            // Counter to limit num outlinks to follow per page
            int outlinkCounter = 0;

            // Calculate variable number of outlinks by depth using the
            // divisor (outlinks = Math.floor(divisor / depth * num.links))
            int maxOutlinksByDepth =
                (int)
                    Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);

            String followUrl;

            // Walk over the outlinks and add as new FetchItem to the queues
            Iterator<String> iter = outlinks.iterator();
            while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
              followUrl = iter.next();

              // Check whether we'll follow external outlinks
              if (outlinksIgnoreExternal) {
                if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                  continue;
                }
              }

              reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);

              // Create new FetchItem with depth incremented
              FetchItem fit =
                  FetchItem.create(
                      new Text(followUrl),
                      new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
                      queueMode,
                      outlinkDepth + 1);
              ((FetchItemQueues) fetchQueues).addFetchItem(fit);

              outlinkCounter++;
            }
          }

          // Overwrite the outlinks in ParseData with the normalized and
          // filtered set
          parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));

          output.collect(
              url,
              new NutchWritable(
                  new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
        }
      }
    } catch (IOException e) {
      if (LOG.isErrorEnabled()) {
        LOG.error("fetcher caught:" + e.toString());
      }
    }

    // return parse status if it exists
    if (parseResult != null && !parseResult.isEmpty()) {
      Parse p = parseResult.get(content.getUrl());
      if (p != null) {
        reporter.incrCounter(
            "ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
        return p.getData().getStatus();
      }
    }
    return null;
  }
Example #6
  private Text handleRedirect(
      Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp, String redirType)
      throws MalformedURLException, URLFilterException {
    newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
    newUrl = urlFilters.filter(newUrl);

    try {
      String origHost = new URL(urlString).getHost().toLowerCase();
      String newHost = new URL(newUrl).getHost().toLowerCase();
      if (ignoreExternalLinks) {
        if (!origHost.equals(newHost)) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(
                " - ignoring redirect "
                    + redirType
                    + " from "
                    + urlString
                    + " to "
                    + newUrl
                    + " because external links are ignored");
          }
          return null;
        }
      }

      if (ignoreInternalLinks) {
        if (origHost.equals(newHost)) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(
                " - ignoring redirect "
                    + redirType
                    + " from "
                    + urlString
                    + " to "
                    + newUrl
                    + " because internal links are ignored");
          }
          return null;
        }
      }
    } catch (MalformedURLException e) {
      // ignore: if either URL cannot be parsed, fall through to the newUrl checks below
    }

    if (newUrl != null && !newUrl.equals(urlString)) {
      reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
      url = new Text(newUrl);
      if (maxRedirect > 0) {
        redirecting = true;
        redirectCount++;
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - " + redirType + " redirect to " + url + " (fetching now)");
        }
        return url;
      } else {
        CrawlDatum newDatum =
            new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(), datum.getScore());
        // transfer existing metadata
        newDatum.getMetaData().putAll(datum.getMetaData());
        try {
          scfilters.initialScore(url, newDatum);
        } catch (ScoringFilterException e) {
          e.printStackTrace();
        }
        if (reprUrl != null) {
          newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
        }
        output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - " + redirType + " redirect to " + url + " (fetching later)");
        }
        return null;
      }
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug(
            " - "
                + redirType
                + " redirect skipped: "
                + (newUrl != null ? "to same url" : "filtered"));
      }
      return null;
    }
  }
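Example #7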
 /**
  * NOTE: in selecting the latest version we rely exclusively on the segment name (not all segment
  * data contain time information). It is therefore extremely important that segments be named in
  * increasing lexicographic order as their creation time increases (a short comparison sketch
  * follows this method).
  */
 public void reduce(
     WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
     throws IOException {
   CrawlDatum lastG = null;
   CrawlDatum lastF = null;
   CrawlDatum lastSig = null;
   Content lastC = null;
   ParseData lastPD = null;
   ParseText lastPT = null;
   String lastGname = null;
   String lastFname = null;
   String lastSigname = null;
   String lastCname = null;
   String lastPDname = null;
   String lastPTname = null;
   TreeMap linked = new TreeMap();
   while (values.hasNext()) {
     MetaWrapper wrapper = (MetaWrapper) values.next();
     Object o = wrapper.get();
     String spString = wrapper.getMeta(SEGMENT_PART_KEY);
     if (spString == null) {
       throw new IOException("Null segment part, key=" + key);
     }
     SegmentPart sp = SegmentPart.parse(spString);
     if (o instanceof CrawlDatum) {
       CrawlDatum val = (CrawlDatum) o;
       // check which output dir it belongs to
       if (sp.partName.equals(CrawlDatum.GENERATE_DIR_NAME)) {
         if (lastG == null) {
           lastG = val;
           lastGname = sp.segmentName;
         } else {
           // take newer
           if (lastGname.compareTo(sp.segmentName) < 0) {
             lastG = val;
             lastGname = sp.segmentName;
           }
         }
       } else if (sp.partName.equals(CrawlDatum.FETCH_DIR_NAME)) {
         if (lastF == null) {
           lastF = val;
           lastFname = sp.segmentName;
         } else {
           // take newer
           if (lastFname.compareTo(sp.segmentName) < 0) {
             lastF = val;
             lastFname = sp.segmentName;
           }
         }
       } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) {
         if (val.getStatus() == CrawlDatum.STATUS_SIGNATURE) {
           if (lastSig == null) {
             lastSig = val;
             lastSigname = sp.segmentName;
           } else {
             // take newer
             if (lastSigname.compareTo(sp.segmentName) < 0) {
               lastSig = val;
               lastSigname = sp.segmentName;
             }
           }
           continue;
         }
         // collect all LINKED values from the latest segment
         ArrayList segLinked = (ArrayList) linked.get(sp.segmentName);
         if (segLinked == null) {
           segLinked = new ArrayList();
           linked.put(sp.segmentName, segLinked);
         }
         segLinked.add(val);
       } else {
         throw new IOException("Cannot determine segment part: " + sp.partName);
       }
     } else if (o instanceof Content) {
       if (lastC == null) {
         lastC = (Content) o;
         lastCname = sp.segmentName;
       } else {
         if (lastCname.compareTo(sp.segmentName) < 0) {
           lastC = (Content) o;
           lastCname = sp.segmentName;
         }
       }
     } else if (o instanceof ParseData) {
       if (lastPD == null) {
         lastPD = (ParseData) o;
         lastPDname = sp.segmentName;
       } else {
         if (lastPDname.compareTo(sp.segmentName) < 0) {
           lastPD = (ParseData) o;
           lastPDname = sp.segmentName;
         }
       }
     } else if (o instanceof ParseText) {
       if (lastPT == null) {
         lastPT = (ParseText) o;
         lastPTname = sp.segmentName;
       } else {
         if (lastPTname.compareTo(sp.segmentName) < 0) {
           lastPT = (ParseText) o;
           lastPTname = sp.segmentName;
         }
       }
     }
   }
   curCount++;
   String sliceName = null;
   MetaWrapper wrapper = new MetaWrapper();
   if (sliceSize > 0) {
     sliceName = String.valueOf(curCount / sliceSize);
     wrapper.setMeta(SEGMENT_SLICE_KEY, sliceName);
   }
   SegmentPart sp = new SegmentPart();
   // now output the latest values
   if (lastG != null) {
     wrapper.set(lastG);
     sp.partName = CrawlDatum.GENERATE_DIR_NAME;
     sp.segmentName = lastGname;
     wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
     output.collect(key, wrapper);
   }
   if (lastF != null) {
     wrapper.set(lastF);
     sp.partName = CrawlDatum.FETCH_DIR_NAME;
     sp.segmentName = lastFname;
     wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
     output.collect(key, wrapper);
   }
   if (lastSig != null) {
     wrapper.set(lastSig);
     sp.partName = CrawlDatum.PARSE_DIR_NAME;
     sp.segmentName = lastSigname;
     wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
     output.collect(key, wrapper);
   }
   if (lastC != null) {
     wrapper.set(lastC);
     sp.partName = Content.DIR_NAME;
     sp.segmentName = lastCname;
     wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
     output.collect(key, wrapper);
   }
   if (lastPD != null) {
     wrapper.set(lastPD);
     sp.partName = ParseData.DIR_NAME;
     sp.segmentName = lastPDname;
     wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
     output.collect(key, wrapper);
   }
   if (lastPT != null) {
     wrapper.set(lastPT);
     sp.partName = ParseText.DIR_NAME;
     sp.segmentName = lastPTname;
     wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
     output.collect(key, wrapper);
   }
   if (linked.size() > 0) {
     String name = (String) linked.lastKey();
     sp.partName = CrawlDatum.PARSE_DIR_NAME;
     sp.segmentName = name;
     wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
     ArrayList segLinked = (ArrayList) linked.get(name);
     for (int i = 0; i < segLinked.size(); i++) {
       CrawlDatum link = (CrawlDatum) segLinked.get(i);
       wrapper.set(link);
       output.collect(key, wrapper);
     }
   }
 }
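As noted in the Javadoc above, choosing the latest value relies on comparing segment names lexicographically. A tiny sketch of why this yields chronological order, assuming Nutch's timestamp-based segment naming (yyyyMMddHHmmss); the two names below are hypothetical:

  // Segment names are creation timestamps, so String.compareTo() orders them chronologically;
  // the "take newer" branches above keep the value whose segment name compares greater.
  String older = "20230101120000"; // hypothetical segment name
  String newer = "20240101120000"; // hypothetical segment name
  assert older.compareTo(newer) < 0;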