Java URLUtil Examples

Programming Language: Java

Namespace/Package Name: org.apache.nutch.util

Class/Type: URLUtil

Examples at hotexamples.com: 5

Java URLUtil - 5 examples found. These are the top rated real world Java examples of org.apache.nutch.util.URLUtil extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

chooseRepr(2)

getDomainName(2)

getHost(2)

toASCII(1)

Example #1

Show file

File: GeneratorReducer.java Project: vuquangtin/apache-nutch-2.2.2

  @Override
  protected void reduce(SelectorEntry key, Iterable<WebPage> values, Context context)
      throws IOException, InterruptedException {
    for (WebPage page : values) {
      if (count >= limit) {
        return;
      }
      if (maxCount > 0) {
        String hostordomain;
        if (byDomain) {
          hostordomain = URLUtil.getDomainName(key.url);
        } else {
          hostordomain = URLUtil.getHost(key.url);
        }

        Integer hostCount = hostCountMap.get(hostordomain);
        if (hostCount == null) {
          hostCountMap.put(hostordomain, 0);
          hostCount = 0;
        }
        if (hostCount >= maxCount) {
          return;
        }
        hostCountMap.put(hostordomain, hostCount + 1);
      }

      Mark.GENERATE_MARK.putMark(page, batchId);
      page.setBatchId(batchId);
      try {
        context.write(TableUtil.reverseUrl(key.url), page);
      } catch (MalformedURLException e) {
        context.getCounter("Generator", "MALFORMED_URL").increment(1);
        continue;
      }
      context.getCounter("Generator", "GENERATE_MARK").increment(1);
      count++;
    }
  }

Example #2

Show file

File: FetcherThread.java Project: Ccccz/nutch

  private ParseStatus output(
      Text key,
      CrawlDatum datum,
      Content content,
      ProtocolStatus pstatus,
      int status,
      int outlinkDepth) {

    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

    ParseResult parseResult = null;
    if (content != null) {
      Metadata metadata = content.getMetadata();

      // store the guessed content type in the crawldatum
      if (content.getContentType() != null)
        datum
            .getMetaData()
            .put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));

      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }
      /*
       * Note: Fetcher will only follow meta-redirects coming from the
       * original URL.
       */
      if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
        if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
          try {
            parseResult = this.parseUtil.parse(content);
          } catch (Exception e) {
            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
          }
        }

        if (parseResult == null) {
          byte[] signature =
              SignatureFactory.getSignature(conf)
                  .calculate(content, new ParseStatus().getEmptyParse(conf));
          datum.setSignature(signature);
        }
      }

      /*
       * Store status code in content So we can read this value during parsing
       * (as a separate job) and decide to parse or not.
       */
      content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }

    try {
      output.collect(key, new NutchWritable(datum));
      if (content != null && storingContent) output.collect(key, new NutchWritable(content));
      if (parseResult != null) {
        for (Entry<Text, Parse> entry : parseResult) {
          Text url = entry.getKey();
          Parse parse = entry.getValue();
          ParseStatus parseStatus = parse.getData().getStatus();
          ParseData parseData = parse.getData();

          if (!parseStatus.isSuccess()) {
            LOG.warn("Error parsing: " + key + ": " + parseStatus);
            parse = parseStatus.getEmptyParse(conf);
          }

          // Calculate page signature. For non-parsing fetchers this will
          // be done in ParseSegment
          byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
          // Ensure segment name and score are in parseData metadata
          parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
          parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
          // Pass fetch time to content meta
          parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
          if (url.equals(key)) datum.setSignature(signature);
          try {
            scfilters.passScoreAfterParsing(url, content, parse);
          } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
              LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
            }
          }

          String origin = null;

          // collect outlinks for subsequent db update
          Outlink[] links = parseData.getOutlinks();
          int outlinksToStore = Math.min(maxOutlinks, links.length);
          if (ignoreExternalLinks || ignoreInternalLinks) {
            URL originURL = new URL(url.toString());
            // based on domain?
            if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
              origin = URLUtil.getDomainName(originURL).toLowerCase();
            }
            // use host
            else {
              origin = originURL.getHost().toLowerCase();
            }
          }

          // used by fetchNode
          if (fetchNode != null) {
            fetchNode.setOutlinks(links);
            fetchNode.setTitle(parseData.getTitle());
            FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
          }
          int validCount = 0;

          // Process all outlinks, normalize, filter and deduplicate
          List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
          HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
          for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
            String toUrl = links[i].getToUrl();

            toUrl =
                ParseOutputFormat.filterNormalize(
                    url.toString(),
                    toUrl,
                    origin,
                    ignoreInternalLinks,
                    ignoreExternalLinks,
                    ignoreExternalLinksMode,
                    urlFilters,
                    urlExemptionFilters,
                    normalizers);
            if (toUrl == null) {
              continue;
            }

            validCount++;
            links[i].setUrl(toUrl);
            outlinkList.add(links[i]);
            outlinks.add(toUrl);
          }

          // Only process depth N outlinks
          if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
            reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size());

            // Counter to limit num outlinks to follow per page
            int outlinkCounter = 0;

            // Calculate variable number of outlinks by depth using the
            // divisor (outlinks = Math.floor(divisor / depth * num.links))
            int maxOutlinksByDepth =
                (int)
                    Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);

            String followUrl;

            // Walk over the outlinks and add as new FetchItem to the queues
            Iterator<String> iter = outlinks.iterator();
            while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
              followUrl = iter.next();

              // Check whether we'll follow external outlinks
              if (outlinksIgnoreExternal) {
                if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                  continue;
                }
              }

              reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);

              // Create new FetchItem with depth incremented
              FetchItem fit =
                  FetchItem.create(
                      new Text(followUrl),
                      new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
                      queueMode,
                      outlinkDepth + 1);
              ((FetchItemQueues) fetchQueues).addFetchItem(fit);

              outlinkCounter++;
            }
          }

          // Overwrite the outlinks in ParseData with the normalized and
          // filtered set
          parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));

          output.collect(
              url,
              new NutchWritable(
                  new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
        }
      }
    } catch (IOException e) {
      if (LOG.isErrorEnabled()) {
        LOG.error("fetcher caught:" + e.toString());
      }
    }

    // return parse status if it exits
    if (parseResult != null && !parseResult.isEmpty()) {
      Parse p = parseResult.get(content.getUrl());
      if (p != null) {
        reporter.incrCounter(
            "ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
        return p.getData().getStatus();
      }
    }
    return null;
  }

Example #3

Show file

File: ParserChecker.java Project: harjiwiga/nutch-2.3.1

  public int run(String[] args) throws Exception {
    boolean dumpText = false;
    boolean force = false;
    String contentType = null;
    String url = null;

    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";

    if (args.length == 0) {
      LOG.error(usage);
      return (-1);
    }

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-forceAs")) {
        force = true;
        contentType = args[++i];
      } else if (args[i].equals("-dumpText")) {
        dumpText = true;
      } else if (i != args.length - 1) {
        LOG.error(usage);
        System.exit(-1);
      } else {
        url = URLUtil.toASCII(args[i]);
      }
    }

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    WebPage page = WebPage.newBuilder().build();

    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);

    if (!protocolOutput.getStatus().isSuccess()) {
      LOG.error(
          "Fetch failed with protocol status: "
              + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
              + ": "
              + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
      return (-1);
    }
    Content content = protocolOutput.getContent();

    if (content == null) {
      LOG.error("No content for " + url);
      return (-1);
    }
    page.setBaseUrl(new org.apache.avro.util.Utf8(url));
    page.setContent(ByteBuffer.wrap(content.getContent()));

    if (force) {
      content.setContentType(contentType);
    } else {
      contentType = content.getContentType();
    }

    if (contentType == null) {
      LOG.error("Failed to determine content type!");
      return (-1);
    }

    page.setContentType(new Utf8(contentType));

    if (ParserJob.isTruncated(url, page)) {
      LOG.warn("Content is truncated, parse may fail!");
    }

    Parse parse = new ParseUtil(conf).parse(url, page);

    if (parse == null) {
      LOG.error("Problem with parse - check log");
      return (-1);
    }

    // Calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page);

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
      LOG.info("signature: " + StringUtil.toHexString(signature));
    }

    LOG.info("---------\nUrl\n---------------\n");
    System.out.print(url + "\n");
    LOG.info("---------\nMetadata\n---------\n");
    Map<CharSequence, ByteBuffer> metadata = page.getMetadata();
    StringBuffer sb = new StringBuffer();
    if (metadata != null) {
      Iterator<Entry<CharSequence, ByteBuffer>> iterator = metadata.entrySet().iterator();
      while (iterator.hasNext()) {
        Entry<CharSequence, ByteBuffer> entry = iterator.next();
        sb.append(entry.getKey().toString())
            .append(" : \t")
            .append(Bytes.toString(entry.getValue()))
            .append("\n");
      }
      System.out.print(sb.toString());
    }
    LOG.info("---------\nOutlinks\n---------\n");
    sb = new StringBuffer();
    for (Outlink l : parse.getOutlinks()) {
      sb.append("  outlink: ").append(l).append('\n');
    }
    System.out.print(sb.toString());
    if (page.getHeaders() != null) {
      LOG.info("---------\nHeaders\n---------\n");
      Map<CharSequence, CharSequence> headers = page.getHeaders();
      StringBuffer headersb = new StringBuffer();
      if (metadata != null) {
        Iterator<Entry<CharSequence, CharSequence>> iterator = headers.entrySet().iterator();
        while (iterator.hasNext()) {
          Entry<CharSequence, CharSequence> entry = iterator.next();
          headersb
              .append(entry.getKey().toString())
              .append(" : \t")
              .append(entry.getValue())
              .append("\n");
        }
        System.out.print(headersb.toString());
      }
    }
    if (dumpText) {
      LOG.info("---------\nParseText\n---------\n");
      System.out.print(parse.getText());
    }

    return 0;
  }

Example #4

Show file

File: FetcherThread.java Project: Ccccz/nutch

  private Text handleRedirect(
      Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp, String redirType)
      throws MalformedURLException, URLFilterException {
    newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
    newUrl = urlFilters.filter(newUrl);

    try {
      String origHost = new URL(urlString).getHost().toLowerCase();
      String newHost = new URL(newUrl).getHost().toLowerCase();
      if (ignoreExternalLinks) {
        if (!origHost.equals(newHost)) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(
                " - ignoring redirect "
                    + redirType
                    + " from "
                    + urlString
                    + " to "
                    + newUrl
                    + " because external links are ignored");
          }
          return null;
        }
      }

      if (ignoreInternalLinks) {
        if (origHost.equals(newHost)) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(
                " - ignoring redirect "
                    + redirType
                    + " from "
                    + urlString
                    + " to "
                    + newUrl
                    + " because internal links are ignored");
          }
          return null;
        }
      }
    } catch (MalformedURLException e) {
    }

    if (newUrl != null && !newUrl.equals(urlString)) {
      reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
      url = new Text(newUrl);
      if (maxRedirect > 0) {
        redirecting = true;
        redirectCount++;
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - " + redirType + " redirect to " + url + " (fetching now)");
        }
        return url;
      } else {
        CrawlDatum newDatum =
            new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(), datum.getScore());
        // transfer existing metadata
        newDatum.getMetaData().putAll(datum.getMetaData());
        try {
          scfilters.initialScore(url, newDatum);
        } catch (ScoringFilterException e) {
          e.printStackTrace();
        }
        if (reprUrl != null) {
          newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
        }
        output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - " + redirType + " redirect to " + url + " (fetching later)");
        }
        return null;
      }
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug(
            " - "
                + redirType
                + " redirect skipped: "
                + (newUrl != null ? "to same url" : "filtered"));
      }
      return null;
    }
  }

Example #5

Show file

File: ParseUtil.java Project: zhuchi/nutch-ajax

  /**
   * Parses given web page and stores parsed content within page. Puts a meta-redirect to outlinks.
   *
   * @param key
   * @param page
   */
  public void process(String key, WebPage page) {
    String url = TableUtil.unreverseUrl(key);
    byte status = page.getStatus().byteValue();
    if (status != CrawlStatus.STATUS_FETCHED) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Skipping " + url + " as status is: " + CrawlStatus.getName(status));
      }
      return;
    }

    Parse parse;
    try {
      parse = parse(url, page);
    } catch (ParserNotFound e) {
      // do not print stacktrace for the fact that some types are not mapped.
      LOG.warn("No suitable parser found: " + e.getMessage());
      return;
    } catch (final Exception e) {
      LOG.warn("Error parsing: " + url + ": " + StringUtils.stringifyException(e));
      return;
    }

    if (parse == null) {
      return;
    }

    org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
    page.setParseStatus(pstatus);
    if (ParseStatusUtils.isSuccess(pstatus)) {
      if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
        String newUrl = ParseStatusUtils.getMessage(pstatus);
        int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
        try {
          newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
          if (newUrl == null) {
            LOG.warn("redirect normalized to null " + url);
            return;
          }
          try {
            newUrl = filters.filter(newUrl);
          } catch (URLFilterException e) {
            return;
          }
          if (newUrl == null) {
            LOG.warn("redirect filtered to null " + url);
            return;
          }
        } catch (MalformedURLException e) {
          LOG.warn("malformed url exception parsing redirect " + url);
          return;
        }
        page.getOutlinks().put(new Utf8(newUrl), new Utf8());
        page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
        if (newUrl == null || newUrl.equals(url)) {
          String reprUrl =
              URLUtil.chooseRepr(url, newUrl, refreshTime < FetcherJob.PERM_REFRESH_TIME);
          if (reprUrl == null) {
            LOG.warn("reprUrl==null for " + url);
            return;
          } else {
            page.setReprUrl(new Utf8(reprUrl));
          }
        }
      } else {
        page.setText(new Utf8(parse.getText()));
        page.setTitle(new Utf8(parse.getTitle()));
        ByteBuffer prevSig = page.getSignature();
        if (prevSig != null) {
          page.setPrevSignature(prevSig);
        }
        final byte[] signature = sig.calculate(page);
        page.setSignature(ByteBuffer.wrap(signature));
        if (page.getOutlinks() != null) {
          page.getOutlinks().clear();
        }
        final Outlink[] outlinks = parse.getOutlinks();
        int outlinksToStore = Math.min(maxOutlinks, outlinks.length);
        String fromHost;
        if (ignoreExternalLinks) {
          try {
            fromHost = new URL(url).getHost().toLowerCase();
          } catch (final MalformedURLException e) {
            fromHost = null;
          }
        } else {
          fromHost = null;
        }
        int validCount = 0;

        for (int i = 0; validCount < outlinksToStore && i < outlinks.length; i++) {
          String toUrl = outlinks[i].getToUrl();
          try {
            toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
            toUrl = filters.filter(toUrl);
          } catch (MalformedURLException e2) {
            continue;
          } catch (URLFilterException e) {
            continue;
          }
          if (toUrl == null) {
            continue;
          }
          Utf8 utf8ToUrl = new Utf8(toUrl);
          if (page.getOutlinks().get(utf8ToUrl) != null) {
            // skip duplicate outlinks
            continue;
          }
          String toHost;
          if (ignoreExternalLinks) {
            try {
              toHost = new URL(toUrl).getHost().toLowerCase();
            } catch (final MalformedURLException e) {
              toHost = null;
            }
            if (toHost == null || !toHost.equals(fromHost)) { // external links
              continue; // skip it
            }
          }
          validCount++;
          page.getOutlinks().put(utf8ToUrl, new Utf8(outlinks[i].getAnchor()));
        }
        Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
        if (fetchMark != null) {
          Mark.PARSE_MARK.putMark(page, fetchMark);
        }
      }
    }
  }