Ejemplo n.º 1
0
  /**
   * Fetches a single URL and, if the response is HTML, runs a {@link GoslingHtmlLinkExtractor}
   * over the response body, reporting found links to a new {@code MyLinkExtractorCallback}.
   *
   * <p>Progress is reported through {@code outputMessage}. The method returns early, without
   * parsing, when the response code is not 200 or when the content type is missing or does not
   * start with {@code text/html}. {@code depth_fetched} is incremented for every 200 response and
   * {@code depth_parsed} for every body that was successfully extracted and closed.
   *
   * <p>{@link MalformedURLException} and {@link IOException} are caught here and reported through
   * {@code outputErrResults}; they are not propagated to the caller.
   *
   * @param url the URL to fetch
   */
  private void buildUrlSets(String url) {

    try {
      outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE);
      // Parsed before the connection is opened so a malformed URL is reported without any I/O.
      URL srcUrl = new URL(url);

      LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool);
      if (proxyHost != null) {
        conn.setProxy(proxyHost, proxyPort);
      }
      if (userAgent != null) {
        conn.setRequestProperty("user-agent", userAgent);
      }
      try {
        conn.execute();
        int resp = conn.getResponseCode();
        if (resp != 200) {
          outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE);
          return;
        }
        depth_fetched[m_curDepth - 1]++;
        String cookies = conn.getResponseHeaderValue("Set-Cookie");
        if (cookies != null) {
          outputMessage("Cookies: " + cookies, PLAIN_MESSAGE);
        }
        String type = conn.getResponseContentType();
        // Case-insensitive prefix match that does not depend on the default locale
        // (String.toLowerCase() maps 'I' to a dotless i under a Turkish locale).
        String htmlType = "text/html";
        if (type == null || !type.regionMatches(true, 0, htmlType, 0, htmlType.length())) {
          outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE);
          return;
        }
        outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE);
        InputStream istr = conn.getResponseInputStream();
        try {
          GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor();
          extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback());
        } finally {
          // Close the body even when extraction fails, so the stream is not leaked.
          if (istr != null) {
            istr.close();
          }
        }
        depth_parsed[m_curDepth - 1]++;
      } finally {
        conn.release();
      }
    } catch (MalformedURLException murle) {
      murle.printStackTrace();
      outputErrResults(url, "Malformed URL:" + murle.getMessage());
    } catch (IOException ex) {
      ex.printStackTrace();
      outputErrResults(url, "IOException: " + ex.getMessage());
    }
  }
  /**
   * Checks that relative and root-relative references are resolved against the base URL
   * {@code http://www.example.com/blah/}.
   *
   * <p>The page mixes hrefs with whitespace before the value (quoted and unquoted), an href
   * starting with {@code /}, a stylesheet {@code link} and a {@code script src}; all five are
   * expected to be reported as absolute URLs.
   */
  public void testRelativeLinksWithLeadingSlash() throws IOException {
    String baseUrl = "http://www.example.com/blah/";

    String url1 = "http://www.example.com/blah/branch1/index.html";
    String url2 = "http://www.example.com/blah/branch2/index.html";
    String url3 = "http://www.example.com/journals/american_imago/toc/aim60.1.html";
    String url4 = "http://www.example.com/css/foo.css";
    String url5 = "http://www.example.com/javascript/bar.js";
    String source =
        "<html><head><title>Test</title></head><body>"
            + "<a href= branch1/index.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=\" branch2/index.html\">link2</a>"
            + "<a href =\" /journals/american_imago/toc/aim60.1.html\">"
            + "<link rel=\"stylesheet\" href=\"/css/foo.css\" >"
            + "<script type=\"text/javascript\" src=\"/javascript/bar.js\"></script>"
            + "Number 1, Spring 2003</a>";

    // The extractor reads the page from the stream directly; no cached-URL fixture is needed.
    extractor.extractUrls(mau, new StringInputStream(source), ENC, baseUrl, cb);

    Set expected = SetUtil.set(url1, url2, url3, url4, url5);
    assertEquals(expected, cb.getFoundUrls());
  }
 /**
  * Passing a null callback to {@code extractUrls} must be rejected with an
  * {@link IllegalArgumentException}; any other outcome fails the test.
  */
 public void testThrowsOnNullCallback() throws IOException {
   try {
     StringInputStream in = new StringInputStream("blah");
     extractor.extractUrls(mau, in, ENC, "http://www.example.com/", null);
   } catch (IllegalArgumentException expected) {
     // This is the behavior under test; nothing more to check.
     return;
   }
   fail("Calling extractUrls with a null callback should have thrown");
 }
 /**
  * Passing a null source URL to {@code extractUrls} must be rejected with an
  * {@link IllegalArgumentException}; any other outcome fails the test.
  */
 public void testThrowsOnNullSourceUrl() throws IOException {
   try {
     extractor.extractUrls(
         mau, new StringInputStream("Blah"), ENC, null, new MyLinkExtractorCallback());
     // The message names the argument that is actually null in the call above.
     fail("Calling extractUrls with a null source URL should have thrown");
   } catch (IllegalArgumentException expected) {
     // This is the behavior under test; nothing more to check.
   }
 }
  /**
   * Runs the extractor over {@code source} with {@code http://www.example.com} as the base URL
   * and returns whatever URLs the shared callback collected.
   *
   * <p>The shared callback is reset first, so the returned set reflects this call only.
   *
   * @param source the markup to parse
   * @return the URLs reported to the callback during this parse
   * @throws IOException if the extractor fails while reading the source
   */
  private Set parseSingleSource(String source) throws IOException {
    final String baseUrl = "http://www.example.com";

    // A fresh archival unit (this local hides the mau used elsewhere in the class) with a
    // regexp-based CSS extractor registered for text/css.
    MockArchivalUnit mau = new MockArchivalUnit();
    LinkExtractor cssExtractor = new RegexpCssLinkExtractor();
    mau.setLinkExtractor("text/css", cssExtractor);

    // NOTE(review): this cached URL is never read again in this method; it is kept because it
    // is constructed with the archival unit handed to the extractor below - confirm whether
    // the constructor has side effects before removing it.
    MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl, mau);
    cachedUrl.setContent(source);

    cb.reset();
    StringInputStream in = new StringInputStream(source);
    extractor.extractUrls(mau, in, ENC, baseUrl, cb);
    return cb.getFoundUrls();
  }
  /**
   * Checks that an {@code img} tag whose {@code alt} value is the literal text {@code src},
   * placed after the real {@code src} attribute, still yields only the real image URL.
   */
  public void testDoCrawlImageWithSrcInAltTagAfterSrcProper() throws IOException {
    String url = "http://www.example.com/link3.html";

    String source =
        "<html><head><title>Test</title></head><body>" + "<img src=" + url + " alt=src>link3</a>";

    // The extractor reads the page from the stream directly; no cached-URL fixture is needed.
    extractor.extractUrls(mau, new StringInputStream(source), ENC, startUrl, cb);

    Set expected = SetUtil.set(url);
    assertEquals(expected, cb.getFoundUrls());
  }
  /**
   * Checks that two relative links sharing the same file name ({@code index.html}) in different
   * directories are both reported, each resolved against {@code http://www.example.com}.
   */
  public void testRelativeLinksWithSameName() throws IOException {
    String baseUrl = "http://www.example.com";

    String url1 = "http://www.example.com/branch1/index.html";
    String url2 = "http://www.example.com/branch2/index.html";

    String source =
        "<html><head><title>Test</title></head><body>"
            + "<a href=branch1/index.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=branch2/index.html>link2</a>";

    // The extractor reads the page from the stream directly; no cached-URL fixture is needed.
    extractor.extractUrls(mau, new StringInputStream(source), ENC, baseUrl, cb);

    Set expected = SetUtil.set(url1, url2);
    assertEquals(expected, cb.getFoundUrls());
  }
  /**
   * Builds a one-tag page with {@code makeContent}, runs the extractor over it with
   * {@code http://www.example.com} as the base URL, and asserts on the URLs found.
   *
   * @param url the URL embedded in the generated content
   * @param startTag opening tag text passed to {@code makeContent}
   * @param endTag closing tag text passed to {@code makeContent}
   * @param au currently unused; the extractor is invoked with the class-level {@code mau}
   * @param shouldParse if true, exactly {@code url} must be found; if false, nothing must be found
   * @throws IOException if the extractor fails while reading the content
   */
  private void singleTagParse(
      String url, String startTag, String endTag, ArchivalUnit au, boolean shouldParse)
      throws IOException {
    String content = makeContent(url, startTag, endTag);

    // A private callback so results from other parses cannot leak into this assertion.
    MyLinkExtractorCallback callback = new MyLinkExtractorCallback();
    // NOTE(review): the au parameter is ignored here and mau is passed instead; left unchanged
    // because callers may rely on it - confirm which archival unit is intended.
    extractor.extractUrls(
        mau, new StringInputStream(content), ENC, "http://www.example.com", callback);

    Set expected = shouldParse ? SetUtil.set(url) : SetUtil.set();
    assertEquals("Misparsed: " + content, expected, callback.getFoundUrls());
  }
 /**
  * Exercises {@code getAttributeValue} against a table of tag texts.
  *
  * <p>Each row is {@code {expected value, attribute name, tag text}}. Rows are checked in order
  * and the first mismatch fails the test. A {@code null} expected value means no value should
  * be found for that attribute.
  */
 public void testGetAttribute() throws IOException {
   String[][] cases = {
     // no value found
     {null, "href", "a bar=foo"},
     {null, "href", "a href"},
     {null, "href", "a href="},
     {null, "href", "a href= "},
     // find proper attribute
     {"foo", "tag", "a tag=foo tag=bar"},
     {"bar", "tag", "a ta=foo tag=bar"},
     {"bar", "tag", "a xy=foo\n tag=bar"},
     // whitespace
     {"foo", "href", "a href=foo"},
     {"foo", "href", "a href =foo"},
     {"foo", "href", "a href = foo"},
     {"foo", "href", "a href = foo\n"},
     {"foo", "href", "a href= foo"},
     {"foo", "href", "a href\t  = \n foo"},
     // quoted strings & whitespace (double quotes)
     {"foo", "href", "a href=\"foo\""},
     {"foo", "href", "a href=\"foo\""},
     {"fo o", "href", "a href  =\"fo o\""},
     {"fo'o", "href", "a href=  \"fo'o\""},
     {"foo", "href", "a href  =\"foo\""},
     // quoted strings & whitespace (single quotes)
     {"foo", "href", "a href='foo'"},
     {"foo", "href", "a href='foo'"},
     {"fo o", "href", "a href  ='fo o'"},
     {"fo\"o", "href", "a href=  'fo\"o'"},
     {"foo", "href", "a href  ='foo'"},
     // empty quoted strings
     {"", "href", "a href=\"\""},
     {"", "href", "a href=''"},
     // dangling quoted strings
     {"", "href", "a href=\""},
     {"xy", "href", "a href=\"xy"},
     // attribute surrounded by other quoted attributes
     {
       "/cgi/reprint/21/1/2.pdf",
       "href",
       "a target=\"_self\" href=\"/cgi/reprint/21/1/2.pdf\" onclick=\"cancelLoadPDF()\""
     },
   };

   for (int i = 0; i < cases.length; i++) {
     String[] row = cases[i];
     assertEquals(row[0], extractor.getAttributeValue(row[1], row[2]));
   }
 }