/* * (non-Javadoc) * * @see org.openid4java.discovery.html.HtmlParser#parse(java.lang.String, * org.openid4java.discovery.html.HtmlResult) */ public void parseHtml(String htmlData, HtmlResult result) throws DiscoveryException { if (DEBUG) _log.debug("Parsing HTML data:\n" + htmlData); HTMLDocumentImpl doc = this.parseDocument(htmlData); NodeList heads = doc.getElementsByTagName("head"); if (heads.getLength() != 1) throw new DiscoveryException( "HTML response must have exactly one HEAD element, " + "found " + heads.getLength() + " : " + heads.toString(), OpenIDException.DISCOVERY_HTML_PARSE_ERROR); HTMLHeadElement head = (HTMLHeadElement) doc.getHead(); NodeList linkElements = head.getElementsByTagName("LINK"); for (int i = 0, len = linkElements.getLength(); i < len; i++) { HTMLLinkElement linkElement = (HTMLLinkElement) linkElements.item(i); setResult(linkElement.getRel(), linkElement.getHref(), result); } if (DEBUG) _log.debug("HTML discovery result:\n" + result); }
public ParseResult getParse(Content content) { String mimeType = content.getContentType(); URL base; try { base = new URL(content.getBaseUrl()); } catch (MalformedURLException e) { return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); } // get the right parser using the mime type as a clue Parser parser = tikaConfig.getParser(MediaType.parse(mimeType)); byte[] raw = content.getContent(); if (parser == null) { String message = "Can't retrieve Tika parser for mime-type " + mimeType; LOG.error(message); return new ParseStatus(ParseStatus.FAILED, message) .getEmptyParseResult(content.getUrl(), getConf()); } LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType); Metadata tikamd = new Metadata(); HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment root = doc.createDocumentFragment(); DOMBuilder domhandler = new DOMBuilder(doc, root); ParseContext context = new ParseContext(); try { parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context); } catch (Exception e) { LOG.error("Error parsing " + content.getUrl(), e); return new ParseStatus(ParseStatus.FAILED, e.getMessage()) .getEmptyParseResult(content.getUrl(), getConf()); } HTMLMetaTags metaTags = new HTMLMetaTags(); String text = ""; String title = ""; Outlink[] outlinks = new Outlink[0]; org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata(); // we have converted the sax events generated by Tika into a DOM object // so we can now use the usual HTML resources from Nutch // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); if (LOG.isTraceEnabled()) { LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); } // check meta directives if (!metaTags.getNoIndex()) { // okay to index StringBuffer sb = new StringBuffer(); if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } utils.getText(sb, root); // extract text text = sb.toString(); sb.setLength(0); if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); } utils.getTitle(sb, root); // extract title title = sb.toString().trim(); } if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks URL baseTag = utils.getBase(root); if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } utils.getOutlinks(baseTag != null ? baseTag : base, l, root); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl()); } } // populate Nutch metadata with Tika metadata String[] TikaMDNames = tikamd.names(); for (String tikaMDName : TikaMDNames) { if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) continue; // TODO what if multivalued? nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName)); } // no outlinks? try OutlinkExtractor e.g works for mime types where no // explicit markup for anchors if (outlinks.length == 0) { outlinks = OutlinkExtractor.getOutlinks(text, getConf()); } ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); if (metaTags.getRefresh()) { status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); status.setArgs( new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) }); } ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata); ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); // run filters on parse ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root); if (metaTags.getNoCache()) { // not okay to cache for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy); } return filteredParse; }
private static void setup() throws Exception { conf = NutchConfiguration.create(); conf.setBoolean("parser.html.form.use_action", true); utils = new DOMContentUtils(conf); TikaParser tikaParser = new TikaParser(); tikaParser.setConf(conf); Parser parser = tikaParser.getTikaConfig().getParser("text/html"); for (int i = 0; i < testPages.length; i++) { Metadata tikamd = new Metadata(); HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment root = doc.createDocumentFragment(); DOMBuilder domhandler = new DOMBuilder(doc, root); ParseContext context = new ParseContext(); // to add once available in Tika // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); try { parser.parse( new ByteArrayInputStream(testPages[i].getBytes()), domhandler, tikamd, context); testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); } catch (Exception e) { e.printStackTrace(); fail("caught exception: " + e); } testDOMs[i] = root; LSSerializerImpl lsi = new LSSerializerImpl(); System.out.println("input " + i + ": '" + testPages[i] + "'"); System.out.println("output " + i + ": '" + lsi.writeToString(root) + "'"); } answerOutlinks = new Outlink[][] { // 0 { new Outlink("http://www.nutch.org", "anchor"), }, // 1 { new Outlink("http://www.nutch.org/", "home"), new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, // 2 { new Outlink("http://www.nutch.org/", "separate this"), new Outlink("http://www.nutch.org/docs/ok", "from this"), }, // 3 { new Outlink("http://www.nutch.org/", "home"), new Outlink("http://www.nutch.org/docs/1", "1"), new Outlink("http://www.nutch.org/docs/2", "2"), }, // 4 { new Outlink("http://www.nutch.org/frames/top.html", ""), new Outlink("http://www.nutch.org/frames/left.html", ""), new Outlink("http://www.nutch.org/frames/invalid.html", ""), new Outlink("http://www.nutch.org/frames/right.html", ""), }, // 5 { new Outlink("http://www.nutch.org/maps/logo.gif", ""), new Outlink("http://www.nutch.org/index.html", ""), new Outlink("http://www.nutch.org/maps/#bottom", ""), new Outlink("http://www.nutch.org/bot.html", ""), new Outlink("http://www.nutch.org/docs/index.html", "") }, // 6 { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, // 7 {}, // 8 { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, // 9 {}, // 10 { new Outlink("http://www.nutch.org/;x", "anchor1"), new Outlink("http://www.nutch.org/g;x", "anchor2"), new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, // 11 { // this is tricky - see RFC3986 section 5.4.1 example 7 new Outlink("http://www.nutch.org/g", "anchor1"), new Outlink("http://www.nutch.org/g?y#s", "anchor2"), new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") } }; }