// implements the filter-method which gives you access to important Objects // like NutchDocument public NutchDocument filter( NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) { LOG.info("-------->>>>> WE ARE IN THE INDExer-------------------"); String containsSem = "false"; containsSem = parse.getData().getMeta(WdcParser.META_CONTAINS_SEM); // we don't have to add the triples in a separate field as they are // already in the content field // String triples = ""; // triples = parse.getText(); // doc.add("triples", triples); // // check if the father contains sem data // boolean semFather = false; // try { // semFather = // Boolean.parseBoolean(datum.getMetaData().get(WdcParser.META_CONTAINS_SEM_FATHER).toString()); // // } catch (Exception e) { // LOG.error("CANNOT PROCESS THE FATHER SEM FIELD" + e.getMessage()); // } // adds the new field to the document doc.add("containsSem", containsSem); return doc; }
/**
 * Adds every configured static field to the document when static fields are
 * enabled; otherwise returns the document untouched.
 *
 * @param doc     the document being built for the index; modified in place
 * @param parse   the parse result of the page (unused here)
 * @param url     the URL of the page (unused here)
 * @param datum   the crawl datum of the page (unused here)
 * @param inlinks the inlinks of the page (unused here)
 * @return the same {@code doc} instance
 * @throws IndexingException declared by the filter contract; not thrown by
 *         the code in this method itself
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // Nothing to contribute when static fields are switched off.
  if (!this.addStaticFields) {
    return doc;
  }

  for (Entry<String, String[]> staticField : this.fields.entrySet()) {
    String fieldName = staticField.getKey();
    String[] fieldValues = staticField.getValue();
    doc.add(fieldName, fieldValues);
  }
  return doc;
}
/** * The {@link RelTagIndexingFilter} filter object. * * @param doc The {@link NutchDocument} object * @param url URL to be filtered for rel-tag's * @param page {@link WebPage} object relative to the URL * @return filtered NutchDocument */ @Override public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException { // Check if some Rel-Tags found, possibly put there by RelTagParser ByteBuffer bb = page.getFromMetadata(new Utf8(RelTagParser.REL_TAG)); if (bb != null) { String[] tags = Bytes.toString(bb).split("\t"); for (int i = 0; i < tags.length; i++) { doc.add("tag", tags[i]); } } return doc; }
public NutchDocument filter( NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); String reprUrlString = reprUrl != null ? reprUrl.toString() : null; String urlString = url.toString(); String host = null; try { URL u; if (reprUrlString != null) { u = new URL(reprUrlString); } else { u = new URL(urlString); } host = u.getHost(); } catch (MalformedURLException e) { throw new IndexingException(e); } if (host != null) { doc.add("host", host); } doc.add("url", reprUrlString == null ? urlString : reprUrlString); // content String content = parse.getText(); if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) { content = content.substring(0, MAX_CONTENT_LENGTH); } doc.add("content", content); // title String title = parse.getData().getTitle(); if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed title = title.substring(0, MAX_TITLE_LENGTH); } if (title.length() > 0) { // NUTCH-1004 Do not index empty values for title field doc.add("title", title); } // add cached content/summary display policy, if available String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY); if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { doc.add("cache", caching); } // add timestamp when fetched, for deduplication doc.add("tstamp", new Date(datum.getFetchTime())); return doc; }