예제 #1
0
 /**
  * Reads the cash stored under {@code CASH_KEY} in the row metadata, divides it by the number of
  * outlinks and assigns the resulting share to every entry of {@code scoreData}.
  *
  * <p>An outlink whose host equals the host of {@code fromUrl} (case-insensitively) receives the
  * share multiplied by {@code internalScoreFactor}; every other outlink, including any whose URL
  * cannot be parsed, receives the share multiplied by {@code externalScoreFactor}. Afterwards the
  * stored cash is reset to zero.
  *
  * <p>Nothing is changed when no cash is stored or the stored cash is zero.
  *
  * @param fromUrl URL of the page whose cash is distributed
  * @param row page holding the cash in its metadata
  * @param scoreData outlink entries that receive a score
  * @param allCount number of outlinks the cash is divided by; when it is not positive every
  *     outlink receives a score of zero instead of an infinite or negative value
  */
 @Override
 public void distributeScoreToOutlinks(
     String fromUrl, WebPage row, Collection<ScoreDatum> scoreData, int allCount) {
   ByteBuffer cashRaw = row.getMetadata().get(CASH_KEY);
   if (cashRaw == null) {
     return;
   }
   float cash = Bytes.toFloat(cashRaw.array(), cashRaw.arrayOffset() + cashRaw.position());
   if (cash == 0) {
     return;
   }
   // TODO: count filtered vs. all count for outlinks
   // Dividing a non-zero float by zero yields Infinity rather than throwing, so guard explicitly
   // to keep non-finite (or negative) scores out of the outlinks.
   float scoreUnit = allCount > 0 ? cash / allCount : 0.0f;
   // internal and external score factor
   float internalScore = scoreUnit * internalScoreFactor;
   float externalScore = scoreUnit * externalScoreFactor;

   // The source host does not change between outlinks: parse it once instead of once per
   // iteration. It stays null when the source URL is malformed, in which case every outlink is
   // treated as external (the same result the per-iteration parsing produced).
   String fromHost = null;
   if (!scoreData.isEmpty()) {
     try {
       fromHost = new URL(fromUrl).getHost();
     } catch (MalformedURLException e) {
       LOG.error("Failed with the following MalformedURLException: ", e);
     }
   }

   for (ScoreDatum scoreDatum : scoreData) {
     float score = externalScore;
     if (fromHost != null) {
       try {
         String toHost = new URL(scoreDatum.getUrl()).getHost();
         if (toHost.equalsIgnoreCase(fromHost)) {
           score = internalScore;
         }
       } catch (MalformedURLException e) {
         // An unparsable outlink falls back to the external score.
         LOG.error("Failed with the following MalformedURLException: ", e);
       }
     }
     scoreDatum.setScore(score);
   }
   // reset cash to zero
   row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(0.0f)));
 }
예제 #2
0
  /**
   * Adds the score of every inlink to the page's current score, stores the result as the new page
   * score and then adds that same result to the cash kept under {@code CASH_KEY} in the page
   * metadata. When no cash is stored yet, 1.0f is used as the starting cash.
   *
   * @param url URL of the page, used for trace logging only
   * @param page page whose score and cash are updated
   * @param inlinkedScoreData score contributions of the inlinks
   */
  @Override
  public void updateScore(String url, WebPage page, List<ScoreDatum> inlinkedScoreData) {
    float total = page.getScore();
    for (ScoreDatum inlink : inlinkedScoreData) {
      LOG.trace("adding <" + inlink.getUrl() + ", " + inlink.getScore() + ">");
      total = total + inlink.getScore();
    }
    // Logged before setScore so the value in parentheses is still the previous page score.
    LOG.trace(url + ": " + total + " (" + page.getScore() + ")");
    page.setScore(total);

    ByteBuffer storedCash = page.getMetadata().get(CASH_KEY);
    float cash =
        storedCash == null
            ? 1.0f
            : Bytes.toFloat(storedCash.array(), storedCash.arrayOffset() + storedCash.position());
    byte[] encodedCash = Bytes.toBytes(cash + total);
    page.getMetadata().put(CASH_KEY, ByteBuffer.wrap(encodedCash));
  }
  /**
   * Copies the rel-tag values stored in the page metadata into the document's {@code tag} field.
   *
   * <p>The metadata entry is looked up under {@link RelTagParser#REL_TAG} and is expected to hold
   * tab-separated tags (presumably written by {@link RelTagParser}). When the entry is absent the
   * document is returned unchanged.
   *
   * @param doc the {@link NutchDocument} to add the tags to
   * @param url URL of the page (not used by this filter)
   * @param page {@link WebPage} whose metadata is inspected
   * @return the same {@code doc} instance, with one {@code tag} value added per stored tag
   */
  @Override
  public NutchDocument filter(NutchDocument doc, String url, WebPage page)
      throws IndexingException {
    ByteBuffer rawTags = page.getFromMetadata(new Utf8(RelTagParser.REL_TAG));
    if (rawTags == null) {
      // No rel-tags recorded for this page.
      return doc;
    }

    for (String tag : Bytes.toString(rawTags).split("\t")) {
      doc.add("tag", tag);
    }
    return doc;
  }
예제 #4
0
 /**
  * Gives the row a score of 1.0f and stores the same value as its cash under {@code CASH_KEY}.
  * The initial value should equal the injected value and must be non-zero.
  *
  * @param url URL of the page (not used here)
  * @param row page that receives the initial score and cash
  */
 @Override
 public void initialScore(String url, WebPage row) throws ScoringFilterException {
   final float initial = 1.0f;
   row.setScore(initial);
   ByteBuffer initialCash = ByteBuffer.wrap(Bytes.toBytes(initial));
   row.getMetadata().put(CASH_KEY, initialCash);
 }
예제 #5
0
 /**
  * Stores the row's current score as its cash under {@code CASH_KEY} in the row metadata.
  *
  * @param url URL of the page (not used here)
  * @param row page whose score is copied into its cash entry
  */
 @Override
 public void injectedScore(String url, WebPage row) throws ScoringFilterException {
   float injected = row.getScore();
   byte[] encodedCash = Bytes.toBytes(injected);
   row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(encodedCash));
 }
예제 #6
0
  /**
   * Fetches a single URL, parses it and prints the URL, page metadata, outlinks, headers and
   * (optionally) the parsed text.
   *
   * <p>Supported arguments: {@code [-dumpText] [-forceAs mimeType] url}. The URL must be the last
   * argument. {@code -forceAs} overrides the content type reported by the protocol and
   * {@code -dumpText} additionally prints the parsed text.
   *
   * <p>Every failure is reported through the logger and signalled with a return value of -1; the
   * method never terminates the JVM itself.
   *
   * @param args command-line arguments as described above
   * @return 0 on success, -1 when the arguments are invalid, the fetch fails, no content or
   *     content type is available, or the parse yields no result
   * @throws Exception if fetching, parsing or signature calculation throws
   */
  public int run(String[] args) throws Exception {
    boolean dumpText = false;
    boolean force = false;
    String contentType = null;
    String url = null;

    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";

    if (args.length == 0) {
      LOG.error(usage);
      return (-1);
    }

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-forceAs")) {
        // -forceAs needs a value; without this check args[++i] would run past the array.
        if (i + 1 >= args.length) {
          LOG.error(usage);
          return (-1);
        }
        force = true;
        contentType = args[++i];
      } else if (args[i].equals("-dumpText")) {
        dumpText = true;
      } else if (i != args.length - 1) {
        // Only the last argument may be the URL. Report through the return value like every
        // other failure in this method rather than killing the JVM.
        LOG.error(usage);
        return (-1);
      } else {
        url = URLUtil.toASCII(args[i]);
      }
    }

    // Options only (e.g. just "-dumpText"), or a URL that could not be converted.
    if (url == null) {
      LOG.error(usage);
      return (-1);
    }

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    WebPage page = WebPage.newBuilder().build();

    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);

    if (!protocolOutput.getStatus().isSuccess()) {
      LOG.error(
          "Fetch failed with protocol status: "
              + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
              + ": "
              + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
      return (-1);
    }
    Content content = protocolOutput.getContent();

    if (content == null) {
      LOG.error("No content for " + url);
      return (-1);
    }
    page.setBaseUrl(new org.apache.avro.util.Utf8(url));
    page.setContent(ByteBuffer.wrap(content.getContent()));

    // Either force the requested type onto the content or adopt the detected one.
    if (force) {
      content.setContentType(contentType);
    } else {
      contentType = content.getContentType();
    }

    if (contentType == null) {
      LOG.error("Failed to determine content type!");
      return (-1);
    }

    page.setContentType(new Utf8(contentType));

    if (ParserJob.isTruncated(url, page)) {
      LOG.warn("Content is truncated, parse may fail!");
    }

    Parse parse = new ParseUtil(conf).parse(url, page);

    if (parse == null) {
      LOG.error("Problem with parse - check log");
      return (-1);
    }

    // Calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page);

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
      LOG.info("signature: " + StringUtil.toHexString(signature));
    }

    LOG.info("---------\nUrl\n---------------\n");
    System.out.print(url + "\n");

    LOG.info("---------\nMetadata\n---------\n");
    Map<CharSequence, ByteBuffer> metadata = page.getMetadata();
    if (metadata != null) {
      StringBuilder metadataText = new StringBuilder();
      for (Entry<CharSequence, ByteBuffer> entry : metadata.entrySet()) {
        metadataText
            .append(entry.getKey().toString())
            .append(" : \t")
            .append(Bytes.toString(entry.getValue()))
            .append("\n");
      }
      System.out.print(metadataText.toString());
    }

    LOG.info("---------\nOutlinks\n---------\n");
    StringBuilder outlinkText = new StringBuilder();
    for (Outlink l : parse.getOutlinks()) {
      outlinkText.append("  outlink: ").append(l).append('\n');
    }
    System.out.print(outlinkText.toString());

    // The section is governed by the headers map itself, not by the metadata map.
    Map<CharSequence, CharSequence> headers = page.getHeaders();
    if (headers != null) {
      LOG.info("---------\nHeaders\n---------\n");
      StringBuilder headerText = new StringBuilder();
      for (Entry<CharSequence, CharSequence> entry : headers.entrySet()) {
        headerText
            .append(entry.getKey().toString())
            .append(" : \t")
            .append(entry.getValue())
            .append("\n");
      }
      System.out.print(headerText.toString());
    }

    if (dumpText) {
      LOG.info("---------\nParseText\n---------\n");
      System.out.print(parse.getText());
    }

    return 0;
  }