/**
 * Feeds the extractor a page whose link attributes hold relative references (some
 * preceded by whitespace, some beginning with a slash) and checks that exactly the five
 * absolute URLs below are reported when the base URL is
 * {@code http://www.example.com/blah/}.
 */
public void testRelativeLinksWithLeadingSlash() throws IOException {
    final String baseUrl = "http://www.example.com/blah/";

    // Absolute URLs the callback is expected to receive.
    String branchOneUrl = "http://www.example.com/blah/branch1/index.html";
    String branchTwoUrl = "http://www.example.com/blah/branch2/index.html";
    String journalUrl = "http://www.example.com/journals/american_imago/toc/aim60.1.html";
    String cssUrl = "http://www.example.com/css/foo.css";
    String scriptUrl = "http://www.example.com/javascript/bar.js";

    String html =
        "<html><head><title>Test</title></head><body>"
            // Path-relative references, unquoted and quoted, with leading whitespace.
            + "<a href= branch1/index.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=\" branch2/index.html\">link2</a>"
            // References that start with a slash.
            + "<a href =\" /journals/american_imago/toc/aim60.1.html\">"
            + "<link rel=\"stylesheet\" href=\"/css/foo.css\" >"
            + "<script type=\"text/javascript\" src=\"/javascript/bar.js\"></script>"
            + "Number 1, Spring 2003</a>";

    // NOTE(review): this MockCachedUrl is populated but never handed to the extractor;
    // confirm whether it is still needed.
    MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl);
    cachedUrl.setContent(html);

    extractor.extractUrls(mau, new StringInputStream(html), ENC, baseUrl, cb);

    Set expectedUrls = SetUtil.set(branchOneUrl, branchTwoUrl, journalUrl, cssUrl, scriptUrl);
    assertEquals(expectedUrls, cb.getFoundUrls());
  }
 /**
  * Checks that {@code extractUrls} throws {@link IllegalArgumentException} when the
  * callback argument is null.
  */
 public void testThrowsOnNullCallback() throws IOException {
   final String baseUrl = "http://www.example.com/";
   try {
     StringInputStream in = new StringInputStream("blah");
     extractor.extractUrls(mau, in, ENC, baseUrl, null);
     fail("Calling extractUrls with a null callback should have thrown");
   } catch (IllegalArgumentException expected) {
     // Expected: the null callback was rejected, so the test passes.
   }
 }
 /**
  * Checks that {@code extractUrls} throws {@link IllegalArgumentException} when the
  * source URL argument is null.
  */
 public void testThrowsOnNullSourceUrl() throws IOException {
   try {
     extractor.extractUrls(
         mau, new StringInputStream("Blah"), ENC, null, new MyLinkExtractorCallback());
     // The null argument is the source URL string, so the message names it as such.
     fail("Calling extractUrls with a null source URL should have thrown");
   } catch (IllegalArgumentException expected) {
     // Expected: the null source URL was rejected, so the test passes.
   }
 }
  /**
   * Runs the extractor over {@code source} with base URL {@code http://www.example.com},
   * using a fresh MockArchivalUnit that has a RegexpCssLinkExtractor registered for
   * {@code text/css}, and returns whatever URLs the shared callback collected.
   *
   * @param source the markup to parse
   * @return the URLs found by the shared callback after it has been reset
   * @throws IOException if the extractor fails while reading the input
   */
  private Set parseSingleSource(String source) throws IOException {
    final String baseUrl = "http://www.example.com";

    // A dedicated archival unit (not the class-level one) that knows how to handle CSS.
    MockArchivalUnit cssAwareAu = new MockArchivalUnit();
    LinkExtractor cssExtractor = new RegexpCssLinkExtractor();
    cssAwareAu.setLinkExtractor("text/css", cssExtractor);

    // NOTE(review): this MockCachedUrl is populated but never handed to the extractor;
    // confirm whether it is still needed.
    MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl, cssAwareAu);
    cachedUrl.setContent(source);

    // Reset the shared callback before extracting so the result reflects only this parse.
    cb.reset();
    StringInputStream in = new StringInputStream(source);
    extractor.extractUrls(cssAwareAu, in, ENC, baseUrl, cb);
    return cb.getFoundUrls();
  }
  /**
   * Checks that an {@code img} tag whose {@code alt} value is the literal text
   * {@code src}, placed after the real {@code src} attribute, still yields only the real
   * image URL.
   */
  public void testDoCrawlImageWithSrcInAltTagAfterSrcProper() throws IOException {
    String imageUrl = "http://www.example.com/link3.html";

    String pageHead = "<html><head><title>Test</title></head><body>";
    // The alt attribute's value is the word "src" and follows the genuine src attribute.
    String imageTag = "<img src=" + imageUrl + " alt=src>link3</a>";
    String html = pageHead + imageTag;

    // NOTE(review): this MockCachedUrl is populated but never handed to the extractor;
    // confirm whether it is still needed.
    MockCachedUrl cachedUrl = new MockCachedUrl(startUrl);
    cachedUrl.setContent(html);

    extractor.extractUrls(mau, new StringInputStream(html), ENC, startUrl, cb);

    Set expectedUrls = SetUtil.set(imageUrl);
    assertEquals(expectedUrls, cb.getFoundUrls());
  }
  /**
   * Checks that two relative links sharing the file name {@code index.html} but living
   * in different directories are both reported, each resolved against
   * {@code http://www.example.com}.
   */
  public void testRelativeLinksWithSameName() throws IOException {
    final String baseUrl = "http://www.example.com";

    String firstBranchUrl = "http://www.example.com/branch1/index.html";
    String secondBranchUrl = "http://www.example.com/branch2/index.html";

    String html =
        "<html><head><title>Test</title></head><body>"
            + "<a href=branch1/index.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=branch2/index.html>link2</a>";

    // NOTE(review): this MockCachedUrl is populated but never handed to the extractor;
    // confirm whether it is still needed.
    MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl);
    cachedUrl.setContent(html);

    extractor.extractUrls(mau, new StringInputStream(html), ENC, baseUrl, cb);

    Set expectedUrls = SetUtil.set(firstBranchUrl, secondBranchUrl);
    assertEquals(expectedUrls, cb.getFoundUrls());
  }
  /**
   * Builds content from {@code url}, {@code startTag} and {@code endTag} via
   * {@code makeContent}, runs the extractor over it with a private callback, and asserts
   * on the URLs that callback collected.
   *
   * @param url the URL placed in the generated content; the only URL expected back when
   *     {@code shouldParse} is true
   * @param startTag passed to {@code makeContent} together with {@code url}
   * @param endTag passed to {@code makeContent} together with {@code url}
   * @param au NOTE(review): not used by this method; extraction runs against the
   *     class-level {@code mau} instead. Confirm whether that is intentional.
   * @param shouldParse true if {@code url} must be found, false if nothing may be found
   * @throws IOException if the extractor fails while reading the content
   */
  private void singleTagParse(
      String url, String startTag, String endTag, ArchivalUnit au, boolean shouldParse)
      throws IOException {
    final String baseUrl = "http://www.example.com";

    // NOTE(review): this MockCachedUrl is populated but never handed to the extractor;
    // confirm whether it is still needed.
    MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl);
    String content = makeContent(url, startTag, endTag);
    cachedUrl.setContent(content);

    // A private callback, so results are independent of the class-level one.
    MyLinkExtractorCallback localCallback = new MyLinkExtractorCallback();
    extractor.extractUrls(mau, new StringInputStream(content), ENC, baseUrl, localCallback);

    // Either exactly the given URL, or nothing at all.
    Set expectedUrls = shouldParse ? SetUtil.set(url) : SetUtil.set();
    assertEquals("Misparsed: " + content, expectedUrls, localCallback.getFoundUrls());
  }
 /**
  * Exercises {@code extractor.getAttributeValue(attributeName, tagText)} against a range
  * of tag bodies: missing values, repeated or similarly named attributes, whitespace
  * around the equals sign, single- and double-quoted values, empty quotes, and
  * unterminated quotes.
  */
 public void testGetAttribute() throws IOException {
   final String href = "href";
   final String tag = "tag";

   // Attribute absent, or present without a value: null is expected.
   assertEquals(null, extractor.getAttributeValue(href, "a bar=foo"));
   assertEquals(null, extractor.getAttributeValue(href, "a href"));
   assertEquals(null, extractor.getAttributeValue(href, "a href="));
   assertEquals(null, extractor.getAttributeValue(href, "a href= "));

   // The first occurrence wins, and a name that is only a prefix ("ta") is not matched.
   assertEquals("foo", extractor.getAttributeValue(tag, "a tag=foo tag=bar"));
   assertEquals("bar", extractor.getAttributeValue(tag, "a ta=foo tag=bar"));
   assertEquals("bar", extractor.getAttributeValue(tag, "a xy=foo\n tag=bar"));

   // Unquoted value with spaces, tabs or newlines around the equals sign.
   assertEquals("foo", extractor.getAttributeValue(href, "a href=foo"));
   assertEquals("foo", extractor.getAttributeValue(href, "a href =foo"));
   assertEquals("foo", extractor.getAttributeValue(href, "a href = foo"));
   assertEquals("foo", extractor.getAttributeValue(href, "a href = foo\n"));
   assertEquals("foo", extractor.getAttributeValue(href, "a href= foo"));
   assertEquals("foo", extractor.getAttributeValue(href, "a href\t  = \n foo"));

   // Double-quoted values, with whitespace on either side of the equals sign.
   assertEquals("foo", extractor.getAttributeValue(href, "a href=\"foo\""));
   // NOTE(review): this case repeats the previous one verbatim.
   assertEquals("foo", extractor.getAttributeValue(href, "a href=\"foo\""));
   assertEquals("fo o", extractor.getAttributeValue(href, "a href  =\"fo o\""));
   assertEquals("fo'o", extractor.getAttributeValue(href, "a href=  \"fo'o\""));
   assertEquals("foo", extractor.getAttributeValue(href, "a href  =\"foo\""));

   // Single-quoted values, mirroring the double-quoted cases.
   assertEquals("foo", extractor.getAttributeValue(href, "a href='foo'"));
   // NOTE(review): this case repeats the previous one verbatim.
   assertEquals("foo", extractor.getAttributeValue(href, "a href='foo'"));
   assertEquals("fo o", extractor.getAttributeValue(href, "a href  ='fo o'"));
   assertEquals("fo\"o", extractor.getAttributeValue(href, "a href=  'fo\"o'"));
   assertEquals("foo", extractor.getAttributeValue(href, "a href  ='foo'"));

   // Empty quoted values yield the empty string rather than null.
   assertEquals("", extractor.getAttributeValue(href, "a href=\"\""));
   assertEquals("", extractor.getAttributeValue(href, "a href=''"));

   // An opening quote with no closing quote: everything after the quote is the value.
   assertEquals("", extractor.getAttributeValue(href, "a href=\""));
   assertEquals("xy", extractor.getAttributeValue(href, "a href=\"xy"));

   // href sitting between other quoted attributes.
   String anchorWithOnclick =
       "a target=\"_self\" href=\"/cgi/reprint/21/1/2.pdf\" onclick=\"cancelLoadPDF()\"";
   assertEquals(
       "/cgi/reprint/21/1/2.pdf", extractor.getAttributeValue(href, anchorWithOnclick));
 }