@Test
  public void testRelativeLinkWithLocationUrl() throws Exception {
    // Scenario: the document is fetched from olddomain.com while the response also carries a
    // Content-Location header of http://newdomain.com. The first outlink is expected to come
    // back on newdomain.com, the second on domain.com.
    final String pageUrl = "http://olddomain.com/relative-urls.html";
    final String contentLocation = "http://newdomain.com";
    final String mimeType = "text/html; charset=utf-8";

    // Fixture markup is read from test/resources.
    final String fixtureHtml = readFromFile("parser-files/relative-urls.html");

    // Response headers handed to the parser together with the body.
    final HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    responseHeaders.add(HttpHeaderNames.CONTENT_LOCATION, contentLocation);

    // The page URL is supplied for both of the first two constructor arguments.
    final FetchedDatum fetched =
        new FetchedDatum(
            pageUrl,
            pageUrl,
            System.currentTimeMillis(),
            responseHeaders,
            new ContentBytes(fixtureHtml.getBytes("utf-8")),
            mimeType,
            0);

    // Run the default parser over the fetched page.
    final ParsedDatum parsed = new SimpleParser().parse(fetched);

    // Exactly two outlinks are expected, in document order.
    final Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(2, links.length);

    final Outlink firstLink = links[0];
    Assert.assertEquals("http://newdomain.com/link1", firstLink.getToUrl());
    Assert.assertEquals("link1", firstLink.getAnchor());
    // TODO KKr - reenable this test when Tika changes are submitted:
    // Assert.assertEquals("nofollow", links[0].getRelAttributes());

    final Outlink secondLink = links[1];
    Assert.assertEquals("http://domain.com/link2", secondLink.getToUrl());
    Assert.assertEquals("link2", secondLink.getAnchor());
  }
  /**
   * Parses a small inline HTML document with a {@code SimpleParser} constructed from a default
   * {@code ParserPolicy} and the flag {@code true}, re-reads the parser's text output with
   * Dom4J, and checks that the paragraph element can still be located via XPath.
   */
  @Test
  public void testHtmlWithTags() throws Exception {
    final String markup =
        "<html><head><title>Title</title></head><body><p>this is a test</p></body></html>";
    final String pageUrl = "http://domain.com/page.html";
    final String mimeType = "text/html; charset=utf-8";

    // Only the content type header is set for this page.
    final HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);

    final FetchedDatum fetched =
        new FetchedDatum(
            pageUrl,
            pageUrl,
            System.currentTimeMillis(),
            responseHeaders,
            new ContentBytes(markup.getBytes("utf-8")),
            mimeType,
            0);

    // NOTE(review): the boolean presumably asks the parser to keep markup in its text output,
    // given that the result is parsed as a document below - confirm against SimpleParser.
    final SimpleParser markupParser = new SimpleParser(new ParserPolicy(), true);
    final ParsedDatum parsed = markupParser.parse(fetched);

    // Feed the parser's text output back through Dom4J.
    final SAXReader xmlReader = new SAXReader(new Parser());
    xmlReader.setEncoding("UTF-8");
    final Document dom = xmlReader.read(new StringInputStream(parsed.getParsedText()));

    // The output carries a namespace declared on the <html> element, so the XPath has to
    // address every step through an explicitly registered prefix.
    final Map<String, String> prefixToUri = new HashMap<String, String>();
    prefixToUri.put("xhtml", "http://www.w3.org/1999/xhtml");
    final XPath paragraphPath = DocumentHelper.createXPath("/xhtml:html/xhtml:body/xhtml:p");
    paragraphPath.setNamespaceURIs(prefixToUri);

    final Node paragraph = paragraphPath.selectSingleNode(dom);
    Assert.assertNotNull(paragraph);
    Assert.assertEquals("this is a test", paragraph.getText());
  }
  /**
   * Parses the base-url fixture as if fetched from olddomain.com and checks the two outlinks
   * it yields. The URL check on the first link is currently disabled (see the TODO below).
   */
  @Test
  public void testRelativeLinkWithBaseUrl() throws Exception {
    final String pageUrl = "http://olddomain.com/base-url.html";
    final String mimeType = "text/html; charset=utf-8";

    // Fixture markup is read from test/resources.
    final String fixtureHtml = readFromFile("parser-files/base-url.html");

    final HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);

    final FetchedDatum fetched =
        new FetchedDatum(
            pageUrl,
            pageUrl,
            System.currentTimeMillis(),
            responseHeaders,
            new ContentBytes(fixtureHtml.getBytes("utf-8")),
            mimeType,
            0);

    // Run the default parser over the fetched page.
    final ParsedDatum parsed = new SimpleParser().parse(fetched);

    // Exactly two outlinks are expected.
    final Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(2, links.length);

    // TODO KKr - reenable this test when Tika parser calls my handler with
    // the <base> element, which is needed to correctly resolve relative links.
    // Assert.assertEquals("http://newdomain.com/link", links[0].getToUrl());
    final Outlink firstLink = links[0];
    Assert.assertEquals("link1", firstLink.getAnchor());

    final Outlink secondLink = links[1];
    Assert.assertEquals("http://domain.com/link", secondLink.getToUrl());
    Assert.assertEquals("link2", secondLink.getAnchor());
  }
  /**
   * Parses the lang-http-equiv fixture while the response announces {@code Content-Language: en}
   * and checks the title, the parsed text and the language reported by the parser.
   *
   * <p>The expected language is {@code "ja"}, not the {@code "en"} from the response header.
   * NOTE(review): presumably the fixture declares the language through an http-equiv meta tag
   * that takes precedence - the fixture itself is not visible from this test.
   */
  @Test
  public void testLanguageDetectionHttpEquiv() throws Exception {
    // Read in test data from test/resources
    String html = readFromFile("parser-files/lang-http-equiv.html");

    // Create FetchedDatum using data. The URL names the fixture that is actually loaded; it
    // previously said lang-dc.html, apparently copied from another language-detection test.
    String url = "http://domain.com/lang-http-equiv.html";
    String contentType = "text/html; charset=utf-8";
    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    // Deliberately differs from the language asserted at the end of the test.
    headers.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum =
        new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);

    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);

    // Verify content is correct
    Assert.assertEquals("SimpleHttpEquiv", parsedDatum.getTitle());

    compareTermsInStrings("SimpleHttpEquiv Content", parsedDatum.getParsedText());

    // The header value "en" must not win here.
    Assert.assertEquals("ja", parsedDatum.getLanguage());
  }
  /**
   * Parses an inline page whose body holds a single {@code <object data="...">} element, using
   * a policy built from {@code ALL_LINK_TAGS} and {@code ALL_LINK_ATTRIBUTE_TYPES}, and checks
   * that the object's data URL is reported as the only outlink.
   */
  @Test
  public void testExtractingObjectTag() throws Exception {
    final String markup =
        "<html><head><title>Title</title></head>"
            + "<body><object data=\"http://domain.com/song.mid\" /></body></html>";
    final String pageUrl = "http://domain.com/music.html";
    final String mimeType = "text/html; charset=utf-8";

    final HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);

    final FetchedDatum fetched =
        new FetchedDatum(
            pageUrl,
            pageUrl,
            System.currentTimeMillis(),
            responseHeaders,
            new ContentBytes(markup.getBytes("utf-8")),
            mimeType,
            0);

    // Policy: no parse-duration limit, every link tag and every link attribute type.
    final ParserPolicy allLinksPolicy =
        new ParserPolicy(
            ParserPolicy.NO_MAX_PARSE_DURATION,
            BaseLinkExtractor.ALL_LINK_TAGS,
            BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);

    // Parser wired explicitly with the simple content and link extractors.
    final SimpleParser objectAwareParser =
        new SimpleParser(
            new SimpleContentExtractor(), new SimpleLinkExtractor(), allLinksPolicy, true);
    final ParsedDatum parsed = objectAwareParser.parse(fetched);

    // The <object> data attribute must be the one and only outlink.
    final Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(1, links.length);
    Assert.assertEquals("http://domain.com/song.mid", links[0].getToUrl());
  }
  /**
   * Parses the all-link-types fixture with a default-constructed {@code SimpleParser} and
   * checks that only the two anchor links (link1 and link2) are returned as outlinks.
   */
  @Test
  public void testDefaultLinkTypes() throws Exception {
    final String pageUrl = "http://domain.com/all-link-types.html";
    final String mimeType = "text/html; charset=utf-8";

    // Fixture markup is read from test/resources.
    final String fixtureHtml = readFromFile("parser-files/all-link-types.html");

    final HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);

    final FetchedDatum fetched =
        new FetchedDatum(
            pageUrl,
            pageUrl,
            System.currentTimeMillis(),
            responseHeaders,
            new ContentBytes(fixtureHtml.getBytes("utf-8")),
            mimeType,
            0);

    // No explicit policy: the parser's defaults decide which links are extracted.
    final ParsedDatum parsed = new SimpleParser().parse(fetched);

    // With the defaults only the <a href> links are expected to come back.
    final Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(2, links.length);

    final Outlink firstLink = links[0];
    Assert.assertEquals("http://newdomain.com/link1", firstLink.getToUrl());
    Assert.assertEquals("link1", firstLink.getAnchor());

    final Outlink secondLink = links[1];
    Assert.assertEquals("http://domain.com/link2", secondLink.getToUrl());
    Assert.assertEquals("link2", secondLink.getAnchor());
  }
  /**
   * Parses the all-link-types fixture with a policy restricted to the {@code a}, {@code img}
   * and {@code link} tags and the {@code href} and {@code src} attributes, and checks the four
   * outlinks expected under that restriction.
   */
  @Test
  public void testSomeLinkTypes() throws Exception {
    // Read in test data from test/resources
    String html = readFromFile("parser-files/all-link-types.html");

    // Create FetchedDatum using data
    String url = "http://domain.com/all-link-types.html";

    String contentType = "text/html; charset=utf-8";
    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum =
        new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);

    // Build the tag and attribute filters as plain HashSets. Double-brace initialization would
    // create an anonymous HashSet subclass per set that holds a reference to this test
    // instance and triggers a "serial" warning, so it is avoided here.
    Set<String> linkTags = new HashSet<String>();
    linkTags.add("a");
    linkTags.add("img");
    linkTags.add("link");

    Set<String> linkAttributeTypes = new HashSet<String>();
    linkAttributeTypes.add("href");
    linkAttributeTypes.add("src");

    // Call parser.parse with the restricted policy
    ParserPolicy policy =
        new ParserPolicy(ParserPolicy.DEFAULT_MAX_PARSE_DURATION, linkTags, linkAttributeTypes);
    SimpleParser parser = new SimpleParser(policy);
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);

    // Verify outlinks are correct: the favicon, both anchor links and one image are expected;
    // nothing else from the fixture should pass the filter.
    Outlink[] outlinks = parsedDatum.getOutlinks();
    Assert.assertEquals(4, outlinks.length);

    Assert.assertEquals("http://newdomain.com/favicon.ico", outlinks[0].getToUrl());
    Assert.assertEquals("http://newdomain.com/link1", outlinks[1].getToUrl());
    Assert.assertEquals("link1", outlinks[1].getAnchor());
    Assert.assertEquals("http://domain.com/link2", outlinks[2].getToUrl());
    Assert.assertEquals("link2", outlinks[2].getAnchor());
    Assert.assertEquals("http://newdomain.com/giant-prawn.jpg", outlinks[3].getToUrl());
  }
  /**
   * Parses the meta-nofollow fixture with a policy constructed from {@code Integer.MAX_VALUE}
   * and checks that the parser reports no outlinks at all.
   */
  @Test
  public void testLinkExtractorWithMetaTags() throws Exception {
    final String pageUrl = "http://domain.com/meta-nofollow.html";
    final String mimeType = "text/html; charset=utf-8";

    final String fixtureHtml = readFromFile("parser-files/meta-nofollow.html");

    final HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);

    final FetchedDatum fetched =
        new FetchedDatum(
            pageUrl,
            pageUrl,
            System.currentTimeMillis(),
            responseHeaders,
            new ContentBytes(fixtureHtml.getBytes("utf-8")),
            mimeType,
            0);

    // Single-argument policy; the remaining settings are whatever that constructor picks.
    final SimpleParser parser = new SimpleParser(new ParserPolicy(Integer.MAX_VALUE));
    final ParsedDatum parsed = parser.parse(fetched);

    // No URL from the page may be returned.
    final Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(0, links.length);
  }
  /**
   * Parses the all-link-types fixture with a policy built from {@code ALL_LINK_TAGS} and
   * {@code ALL_LINK_ATTRIBUTE_TYPES} and checks all seven outlinks that are expected, in order.
   */
  @Test
  public void testAllLinkTypes() throws Exception {
    final String pageUrl = "http://domain.com/all-link-types.html";
    final String mimeType = "text/html; charset=utf-8";

    // Fixture markup is read from test/resources.
    final String fixtureHtml = readFromFile("parser-files/all-link-types.html");

    final HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);

    final FetchedDatum fetched =
        new FetchedDatum(
            pageUrl,
            pageUrl,
            System.currentTimeMillis(),
            responseHeaders,
            new ContentBytes(fixtureHtml.getBytes("utf-8")),
            mimeType,
            0);

    // Policy: default parse-duration limit, every link tag and every link attribute type.
    final ParserPolicy allLinksPolicy =
        new ParserPolicy(
            ParserPolicy.DEFAULT_MAX_PARSE_DURATION,
            BaseLinkExtractor.ALL_LINK_TAGS,
            BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
    final ParsedDatum parsed = new SimpleParser(allLinksPolicy).parse(fetched);

    // Seven outlinks are expected; anchors are only checked for the two text links.
    final Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(7, links.length);

    Assert.assertEquals("http://newdomain.com/favicon.ico", links[0].getToUrl());

    final Outlink firstTextLink = links[1];
    Assert.assertEquals("http://newdomain.com/link1", firstTextLink.getToUrl());
    Assert.assertEquals("link1", firstTextLink.getAnchor());

    final Outlink secondTextLink = links[2];
    Assert.assertEquals("http://domain.com/link2", secondTextLink.getToUrl());
    Assert.assertEquals("link2", secondTextLink.getAnchor());

    Assert.assertEquals("http://newdomain.com/giant-prawn.jpg", links[3].getToUrl());
    Assert.assertEquals(
        "http://en.wikipedia.org/wiki/Australia's_Big_Things", links[4].getToUrl());
    Assert.assertEquals("http://newdomain.com/giant-dog.jpg", links[5].getToUrl());
    Assert.assertEquals(
        "http://www.brucelawson.co.uk/index.php/2005/stupid-stock-photography/",
        links[6].getToUrl());
  }
  // Example #10 (listing marker, not part of the test code)
  /**
   * Plugs anonymous content and link extractors into {@code SimpleParser} and checks that the
   * parsed text comes from the custom content extractor while the title is still "Simple".
   */
  @SuppressWarnings("serial")
  @Test
  public void testCustomContentExtractor() throws Exception {
    final String pageUrl = "http://domain.com/simple-content.html";
    final String mimeType = "text/html; charset=utf-8";

    final String fixtureHtml = readFromFile("parser-files/simple-content.html");

    final HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);

    final FetchedDatum fetched =
        new FetchedDatum(
            pageUrl,
            pageUrl,
            System.currentTimeMillis(),
            responseHeaders,
            new ContentBytes(fixtureHtml.getBytes("utf-8")),
            mimeType,
            0);

    // Content extractor that always answers with the fixed text "Custom".
    final BaseContentExtractor fixedContentExtractor =
        new BaseContentExtractor() {
          @Override
          public String getContent() {
            return "Custom";
          }
        };

    // Link extractor that never reports any links.
    final BaseLinkExtractor emptyLinkExtractor =
        new BaseLinkExtractor() {
          @Override
          public Outlink[] getLinks() {
            return new Outlink[0];
          }
        };

    final SimpleParser customParser =
        new SimpleParser(fixedContentExtractor, emptyLinkExtractor, new ParserPolicy());
    final ParsedDatum parsed = customParser.parse(fetched);

    // The title is expected to be "Simple"; the text must be the custom extractor's output.
    Assert.assertEquals("Simple", parsed.getTitle());
    compareTermsInStrings("Custom", parsed.getParsedText());
  }