コード例 #1
0
  private Set parseSingleSource(String source) throws IOException {
    MockArchivalUnit mau = new MockArchivalUnit();
    LinkExtractor ue = new RegexpCssLinkExtractor();
    mau.setLinkExtractor("text/css", ue);
    MockCachedUrl mcu = new MockCachedUrl("http://www.example.com", mau);
    mcu.setContent(source);

    cb.reset();
    extractor.extractUrls(mau, new StringInputStream(source), ENC, "http://www.example.com", cb);
    return cb.getFoundUrls();
  }
コード例 #2
0
  public void testRelativeLinksWithLeadingSlash() throws IOException {
    String url1 = "http://www.example.com/blah/branch1/index.html";
    String url2 = "http://www.example.com/blah/branch2/index.html";
    String url3 = "http://www.example.com/journals/american_imago/toc/aim60.1.html";
    String url4 = "http://www.example.com/css/foo.css";
    String url5 = "http://www.example.com/javascript/bar.js";
    String source =
        "<html><head><title>Test</title></head><body>"
            + "<a href= branch1/index.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=\" branch2/index.html\">link2</a>"
            + "<a href =\" /journals/american_imago/toc/aim60.1.html\">"
            + "<link rel=\"stylesheet\" href=\"/css/foo.css\" >"
            + "<script type=\"text/javascript\" src=\"/javascript/bar.js\"></script>"
            + "Number 1, Spring 2003</a>";

    MockCachedUrl mcu = new MockCachedUrl("http://www.example.com/blah/");
    mcu.setContent(source);

    extractor.extractUrls(
        mau, new StringInputStream(source), ENC, "http://www.example.com/blah/", cb);

    Set expected = SetUtil.set(url1, url2, url3, url4, url5);
    assertEquals(expected, cb.getFoundUrls());
  }
コード例 #3
0
  private void singleTagParse(
      String url, String startTag, String endTag, ArchivalUnit au, boolean shouldParse)
      throws IOException {
    MockCachedUrl mcu = new MockCachedUrl("http://www.example.com");
    String content = makeContent(url, startTag, endTag);
    mcu.setContent(content);

    MyLinkExtractorCallback cb = new MyLinkExtractorCallback();
    extractor.extractUrls(mau, new StringInputStream(content), ENC, "http://www.example.com", cb);

    if (shouldParse) {
      Set expected = SetUtil.set(url);
      assertEquals("Misparsed: " + content, expected, cb.getFoundUrls());
    } else {
      Set expected = SetUtil.set();
      assertEquals("Misparsed: " + content, expected, cb.getFoundUrls());
    }
  }
コード例 #4
0
  public void testDoCrawlImageWithSrcInAltTagAfterSrcProper() throws IOException {
    String url = "http://www.example.com/link3.html";

    String source =
        "<html><head><title>Test</title></head><body>" + "<img src=" + url + " alt=src>link3</a>";

    MockCachedUrl mcu = new MockCachedUrl(startUrl);
    mcu.setContent(source);

    extractor.extractUrls(mau, new StringInputStream(source), ENC, startUrl, cb);

    Set expected = SetUtil.set(url);
    assertEquals(expected, cb.getFoundUrls());
  }
コード例 #5
0
  public void testRelativeLinksWithSameName() throws IOException {
    String url1 = "http://www.example.com/branch1/index.html";
    String url2 = "http://www.example.com/branch2/index.html";

    String source =
        "<html><head><title>Test</title></head><body>"
            + "<a href=branch1/index.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=branch2/index.html>link2</a>";

    MockCachedUrl mcu = new MockCachedUrl("http://www.example.com");
    mcu.setContent(source);

    extractor.extractUrls(mau, new StringInputStream(source), ENC, "http://www.example.com", cb);

    Set expected = SetUtil.set(url1, url2);
    assertEquals(expected, cb.getFoundUrls());
  }