/*
   * (non-Javadoc)
   *
   * @see org.openid4java.discovery.html.HtmlParser#parse(java.lang.String,
   *      org.openid4java.discovery.html.HtmlResult)
   */
  public void parseHtml(String htmlData, HtmlResult result) throws DiscoveryException {
    if (DEBUG) _log.debug("Parsing HTML data:\n" + htmlData);

    HTMLDocumentImpl doc = this.parseDocument(htmlData);

    NodeList heads = doc.getElementsByTagName("head");
    if (heads.getLength() != 1)
      throw new DiscoveryException(
          "HTML response must have exactly one HEAD element, "
              + "found "
              + heads.getLength()
              + " : "
              + heads.toString(),
          OpenIDException.DISCOVERY_HTML_PARSE_ERROR);

    HTMLHeadElement head = (HTMLHeadElement) doc.getHead();
    NodeList linkElements = head.getElementsByTagName("LINK");
    for (int i = 0, len = linkElements.getLength(); i < len; i++) {
      HTMLLinkElement linkElement = (HTMLLinkElement) linkElements.item(i);
      setResult(linkElement.getRel(), linkElement.getHref(), result);
    }

    if (DEBUG) _log.debug("HTML discovery result:\n" + result);
  }
Exemplo n.º 2
0
  public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();

    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
      LOG.error(message);
      return new ParseStatus(ParseStatus.FAILED, message)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);

    Metadata tikamd = new Metadata();

    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    try {
      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
    } catch (Exception e) {
      LOG.error("Error parsing " + content.getUrl(), e);
      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
      }
    }

    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) continue;
      // TODO what if multivalued?
      nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
    }

    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(
          new String[] {
            metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime())
          });
    }
    ParseData parseData =
        new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult =
        ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
  }
  private static void setup() throws Exception {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);
    TikaParser tikaParser = new TikaParser();
    tikaParser.setConf(conf);
    Parser parser = tikaParser.getTikaConfig().getParser("text/html");
    for (int i = 0; i < testPages.length; i++) {
      Metadata tikamd = new Metadata();

      HTMLDocumentImpl doc = new HTMLDocumentImpl();
      doc.setErrorChecking(false);
      DocumentFragment root = doc.createDocumentFragment();
      DOMBuilder domhandler = new DOMBuilder(doc, root);
      ParseContext context = new ParseContext();
      // to add once available in Tika
      // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
      try {
        parser.parse(
            new ByteArrayInputStream(testPages[i].getBytes()), domhandler, tikamd, context);
        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
      } catch (Exception e) {
        e.printStackTrace();
        fail("caught exception: " + e);
      }
      testDOMs[i] = root;
      LSSerializerImpl lsi = new LSSerializerImpl();
      System.out.println("input " + i + ": '" + testPages[i] + "'");
      System.out.println("output " + i + ": '" + lsi.writeToString(root) + "'");
    }
    answerOutlinks =
        new Outlink[][] {
          // 0
          {
            new Outlink("http://www.nutch.org", "anchor"),
          },
          // 1
          {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
          },
          // 2
          {
            new Outlink("http://www.nutch.org/", "separate this"),
            new Outlink("http://www.nutch.org/docs/ok", "from this"),
          },

          // 3
          {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/1", "1"),
            new Outlink("http://www.nutch.org/docs/2", "2"),
          },
          // 4
          {
            new Outlink("http://www.nutch.org/frames/top.html", ""),
            new Outlink("http://www.nutch.org/frames/left.html", ""),
            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
            new Outlink("http://www.nutch.org/frames/right.html", ""),
          },
          // 5
          {
            new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),
            new Outlink("http://www.nutch.org/docs/index.html", "")
          },
          // 6
          {
            new Outlink("http://www.nutch.org/index.html", "whitespace test"),
          },
          // 7
          {},
          // 8
          {
            new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
          },
          // 9
          {},
          // 10
          {
            new Outlink("http://www.nutch.org/;x", "anchor1"),
            new Outlink("http://www.nutch.org/g;x", "anchor2"),
            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
          },
          // 11
          {
            // this is tricky - see RFC3986 section 5.4.1 example 7
            new Outlink("http://www.nutch.org/g", "anchor1"),
            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
            new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
          }
        };
  }