Ejemplo n.º 1
0
  /**
   * Adds a {@code containsSem} field to the document, using the value stored
   * in the parse metadata under {@code WdcParser.META_CONTAINS_SEM}.
   *
   * <p>When the parse metadata holds no value for that key, the field is set
   * to {@code "false"} instead of a null value.
   *
   * @param doc the document being indexed; receives the new field
   * @param parse the parse result whose metadata is read
   * @param url the URL of the page (not used by this filter)
   * @param datum the crawl datum of the page (not used by this filter)
   * @param inlinks the inlinks of the page (not used by this filter)
   * @return the same {@code doc} instance, with {@code containsSem} added
   */
  public NutchDocument filter(
      NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {
    LOG.info("-------->>>>> WE ARE IN THE INDExer-------------------");

    // The metadata lookup may yield null when the parser stored nothing for
    // this key; fall back to "false" so the field never carries a null value.
    String containsSem = parse.getData().getMeta(WdcParser.META_CONTAINS_SEM);
    if (containsSem == null) {
      containsSem = "false";
    }

    // The triples themselves are not added as a separate field because they
    // are already part of the content field.
    doc.add("containsSem", containsSem);
    return doc;
  }
Ejemplo n.º 2
0
  /**
   * Adds every configured static field to the document when static fields
   * are enabled; otherwise returns the document untouched.
   *
   * @param doc the document being indexed
   * @param parse the parse result (not used by this filter)
   * @param url the URL of the page (not used by this filter)
   * @param datum the crawl datum of the page (not used by this filter)
   * @param inlinks the inlinks of the page (not used by this filter)
   * @return the same {@code doc} instance
   * @throws IndexingException declared by the filter contract; not thrown by
   *     the code visible here
   */
  public NutchDocument filter(
      NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
      throws IndexingException {

    if (!this.addStaticFields) {
      return doc;
    }

    // NOTE(review): each value is a String[] handed to doc.add as a single
    // argument -- confirm that NutchDocument.add expands arrays as intended.
    for (Entry<String, String[]> staticField : this.fields.entrySet()) {
      String fieldName = staticField.getKey();
      String[] fieldValues = staticField.getValue();
      doc.add(fieldName, fieldValues);
    }
    return doc;
  }
  /**
   * Copies the rel-tags stored in the page metadata into the document, one
   * {@code tag} field value per tag.
   *
   * <p>The tags are read from the metadata entry keyed by
   * {@code RelTagParser.REL_TAG} and are expected to be tab-separated. When
   * the entry is missing the document is returned unchanged.
   *
   * @param doc the {@link NutchDocument} being indexed
   * @param url the URL of the page (not used by this filter)
   * @param page the {@link WebPage} whose metadata is inspected
   * @return the same {@code doc} instance
   * @throws IndexingException declared by the filter contract; not thrown by
   *     the code visible here
   */
  @Override
  public NutchDocument filter(NutchDocument doc, String url, WebPage page)
      throws IndexingException {
    // Rel-tags, if any, were presumably stored by RelTagParser during parsing.
    ByteBuffer rawTags = page.getFromMetadata(new Utf8(RelTagParser.REL_TAG));
    if (rawTags == null) {
      return doc;
    }

    for (String tag : Bytes.toString(rawTags).split("\t")) {
      doc.add("tag", tag);
    }
    return doc;
  }
Ejemplo n.º 4
0
  /**
   * Adds the basic index fields to the document: {@code host}, {@code url},
   * {@code content}, {@code title} (only when non-empty), {@code cache}
   * (only when a caching policy other than "none" is present) and
   * {@code tstamp}.
   *
   * <p>The representative URL stored in the crawl datum metadata is preferred
   * over {@code url} when it is present. Content and title are truncated to
   * {@code MAX_CONTENT_LENGTH} and {@code MAX_TITLE_LENGTH} characters; a
   * negative limit disables truncation for that field.
   *
   * @param doc the document being indexed
   * @param parse the parse result providing text, title and metadata
   * @param url the URL of the page
   * @param datum the crawl datum providing the representative URL and the
   *     fetch time
   * @param inlinks the inlinks of the page (not used by this filter)
   * @return the same {@code doc} instance with the fields added
   * @throws IndexingException if the URL to index is malformed
   */
  public NutchDocument filter(
      NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
      throws IndexingException {
    Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
    String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
    String urlString = url.toString();

    // The representative URL, when available, wins for both host and url.
    String indexedUrl = reprUrlString != null ? reprUrlString : urlString;

    String host;
    try {
      host = new URL(indexedUrl).getHost();
    } catch (MalformedURLException e) {
      throw new IndexingException(e);
    }

    if (host != null) {
      doc.add("host", host);
    }

    doc.add("url", indexedUrl);

    // content
    String content = parse.getText();
    if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) {
      content = content.substring(0, MAX_CONTENT_LENGTH);
    }
    doc.add("content", content);

    // title: a negative limit means "no limit", mirroring the content check.
    // Without this guard a negative limit would make substring throw.
    String title = parse.getData().getTitle();
    if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) {
      title = title.substring(0, MAX_TITLE_LENGTH);
    }

    if (title.length() > 0) {
      // NUTCH-1004 Do not index empty values for title field
      doc.add("title", title);
    }

    // add cached content/summary display policy, if available
    String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
    if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
      doc.add("cache", caching);
    }

    // add timestamp when fetched, for deduplication
    doc.add("tstamp", new Date(datum.getFetchTime()));

    return doc;
  }