@Override public void execute(Tuple tuple) { byte[] content = tuple.getBinaryByField("content"); String url = tuple.getStringByField("url"); Metadata metadata = (Metadata) tuple.getValueByField("metadata"); // check that its content type is HTML // look at value found in HTTP headers boolean CT_OK = false; String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE); if (StringUtils.isNotBlank(httpCT)) { if (httpCT.toLowerCase().contains("html")) { CT_OK = true; } } // simply ignore cases where the content type has not been set // TODO sniff content with Tika? else { CT_OK = true; } if (!CT_OK) { String errorMessage = "Exception content-type " + httpCT + " for " + url; RuntimeException e = new RuntimeException(errorMessage); handleException(url, e, metadata, tuple, "content-type checking", errorMessage); return; } LOG.info("Parsing : starting {}", url); long start = System.currentTimeMillis(); String charset = getContentCharset(content, metadata); // get the robots tags from the fetch metadata RobotsTags robotsTags = new RobotsTags(metadata); Map<String, List<String>> slinks; String text; DocumentFragment fragment; try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) { org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url); fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc); // extracts the robots directives from the meta tags robotsTags.extractMetaTags(fragment); // store a normalised representation in metadata // so that the indexer is aware of it robotsTags.normaliseToMetadata(metadata); // do not extract the links if no follow has been set // and we are in strict mode if (robotsTags.isNoFollow() && robots_noFollow_strict) { slinks = new HashMap<>(0); } else { Elements links = jsoupDoc.select("a[href]"); slinks = new HashMap<>(links.size()); for (Element link : links) { // abs:href tells jsoup to return fully qualified domains // for // relative urls. // e.g.: /foo will resolve to http://shopstyle.com/foo String targetURL = link.attr("abs:href"); // nofollow boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel")); // remove altogether if (noFollow && robots_noFollow_strict) { continue; } // link not specifically marked as no follow // but whole page is if (!noFollow && robotsTags.isNoFollow()) { noFollow = true; } String anchor = link.text(); if (StringUtils.isNotBlank(targetURL)) { // any existing anchors for the same target? List<String> anchors = slinks.get(targetURL); if (anchors == null) { anchors = new LinkedList<>(); slinks.put(targetURL, anchors); } // track the anchors only if no follow is false if (!noFollow && StringUtils.isNotBlank(anchor)) { anchors.add(anchor); } } } } text = jsoupDoc.body().text(); } catch (Throwable e) { String errorMessage = "Exception while parsing " + url + ": " + e; handleException(url, e, metadata, tuple, "content parsing", errorMessage); return; } // store identified charset in md metadata.setValue("parse.Content-Encoding", charset); long duration = System.currentTimeMillis() - start; LOG.info("Parsed {} in {} msec", url, duration); List<Outlink> outlinks = toOutlinks(url, metadata, slinks); ParseResult parse = new ParseResult(); parse.setOutlinks(outlinks); // parse data of the parent URL ParseData parseData = parse.get(url); parseData.setMetadata(metadata); parseData.setText(text); parseData.setContent(content); // apply the parse filters if any try { parseFilters.filter(url, content, fragment, parse); } catch (RuntimeException e) { String errorMessage = "Exception while running parse filters on " + url + ": " + e; handleException(url, e, metadata, tuple, "content filtering", errorMessage); return; } if (emitOutlinks) { for (Outlink outlink : parse.getOutlinks()) { collector.emit( StatusStreamName, tuple, new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED)); } } // emit each document/subdocument in the ParseResult object // there should be at least one ParseData item for the "parent" URL for (Map.Entry<String, ParseData> doc : parse) { ParseData parseDoc = doc.getValue(); collector.emit( tuple, new Values( doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText())); } collector.ack(tuple); eventCounter.scope("tuple_success").incr(); }