Exemplo n.º 1
0
  /**
   * Rewrites the URLs that must already be correct at page-load time, so that they resolve to
   * absolute URLs.
   *
   * <p>When the document has no BASE HREF, a BASE tag carrying the capture's original URL is added
   * through {@code insertAtStartOfHead}; when it has one, that value replaces the original URL as
   * the base passed to every later step. The FRAME-SRC, META-URL, LINK-HREF and SCRIPT-SRC
   * tag-attribute pairs, plus the {@code background} attribute on any tag, are then handed to
   * {@code TagMagix.markupTagREURIC}, followed by the CSS-import and style-URL passes.
   */
  public void resolvePageUrls() {

    // TODO: get url from Resource instead of SearchResult?
    String baseUrl = result.getOriginalUrl();
    final String timestamp = result.getCaptureTimestamp();

    final String declaredBase = TagMagix.getBaseHref(sb);
    if (declaredBase != null) {
      // The page declares its own base: resolve against that instead of the capture URL.
      baseUrl = declaredBase;
    } else {
      // NOTE(review): baseUrl is concatenated into the attribute unescaped - confirm that a URL
      // containing a double quote cannot reach this point.
      insertAtStartOfHead("<base href=\"" + baseUrl + "\" />");
    }

    // Each row is a {tag name, attribute name} pair.
    final String[][] loadTimeAttributes = {
      {"FRAME", "SRC"},
      {"META", "URL"},
      {"LINK", "HREF"},
      {"SCRIPT", "SRC"},
      {TagMagix.ANY_TAGNAME, "background"}
    };
    // TODO: The classic WM added a js_ to the datespec, so NotInArchives
    // can return a valid javascript doc, and not cause Javascript errors.
    for (String[] pair : loadTimeAttributes) {
      final String tagName = pair[0];
      final String attributeName = pair[1];
      TagMagix.markupTagREURIC(sb, uriConverter, timestamp, baseUrl, tagName, attributeName);
    }
    TagMagix.markupCSSImports(sb, uriConverter, timestamp, baseUrl);
    TagMagix.markupStyleUrls(sb, uriConverter, timestamp, baseUrl);
  }
Exemplo n.º 2
0
  /**
   * Rewrites every URL-bearing tag attribute in the page, so that the URLs resolve correctly to
   * absolute URLs within the Wayback service.
   *
   * <p>The base used for resolution is the document's BASE HREF when one is present, otherwise the
   * capture's original URL. Unlike {@code resolvePageUrls}, this method never inserts a BASE tag.
   * Tag attributes are processed with {@code uriConverter} wrapped in a
   * {@code SpecialResultURIConverter}; the CSS-import and style-URL passes receive the unwrapped
   * {@code uriConverter}.
   */
  public void resolveAllPageUrls() {

    // TODO: get url from Resource instead of SearchResult?
    final String originalUrl = result.getOriginalUrl();
    final String timestamp = result.getCaptureTimestamp();

    final String declaredBase = TagMagix.getBaseHref(sb);
    final String baseUrl = (declaredBase == null) ? originalUrl : declaredBase;
    final ResultURIConverter wrappedConverter = new SpecialResultURIConverter(uriConverter);

    // TODO: forms...?
    // Each row is a {tag name, attribute name} pair.
    // NOTE(review): {"OBJECT", "CDATA"} looks like it may have been meant as the OBJECT "data"
    // attribute - confirm against the HTML spec before changing.
    final String[][] urlAttributes = {
      {"FRAME", "SRC"},
      {"META", "URL"},
      {"LINK", "HREF"},
      {"SCRIPT", "SRC"},
      {"IMG", "SRC"},
      {"A", "HREF"},
      {"AREA", "HREF"},
      {"OBJECT", "CODEBASE"},
      {"OBJECT", "CDATA"},
      {"APPLET", "CODEBASE"},
      {"APPLET", "ARCHIVE"},
      {"EMBED", "SRC"},
      {"IFRAME", "SRC"},
      {TagMagix.ANY_TAGNAME, "background"}
    };
    for (String[] pair : urlAttributes) {
      final String tagName = pair[0];
      final String attributeName = pair[1];
      TagMagix.markupTagREURIC(sb, wrappedConverter, timestamp, baseUrl, tagName, attributeName);
    }
    // NOTE(review): the CSS passes use the raw uriConverter rather than wrappedConverter -
    // presumably intentional; verify.
    TagMagix.markupCSSImports(sb, uriConverter, timestamp, baseUrl);
    TagMagix.markupStyleUrls(sb, uriConverter, timestamp, baseUrl);
  }
Exemplo n.º 3
0
  /**
   * Hands the HREF attribute of every REF tag to {@code TagMagix.markupTagREURIC}, using
   * {@code uriConverter} wrapped in an {@code MMSToHTTPResultURIConverter}.
   *
   * <p>The capture's original URL is always used as the base; no BASE HREF lookup is performed.
   */
  public void resolveASXRefUrls() {

    // TODO: get url from Resource instead of SearchResult?
    final String originalUrl = result.getOriginalUrl();
    final String timestamp = result.getCaptureTimestamp();
    final ResultURIConverter mmsToHttpConverter = new MMSToHTTPResultURIConverter(uriConverter);

    TagMagix.markupTagREURIC(sb, mmsToHttpConverter, timestamp, originalUrl, "REF", "HREF");
  }
Exemplo n.º 4
0
 /**
  * Adds to this partition every capture from {@code results} whose capture timestamp falls inside
  * this partition's time range.
  *
  * <p>The range is half-open: a timestamp equal to {@code startDateStr} is accepted, one equal to
  * {@code endDateStr} is rejected. Timestamps are compared as strings, so ordering is
  * lexicographic.
  *
  * @param results the captures to test against this partition's range
  */
 public void filter(CaptureSearchResults results) {
   for (Iterator<CaptureSearchResult> it = results.iterator(); it.hasNext(); ) {
     CaptureSearchResult candidate = it.next();
     String timestamp = candidate.getCaptureTimestamp();
     // Skip anything before the start or at/after the end of the range.
     if ((timestamp.compareTo(startDateStr) < 0) || (timestamp.compareTo(endDateStr) >= 0)) {
       continue;
     }
     matches.add(candidate);
   }
 }
Exemplo n.º 5
0
  /**
   * Ensures the document declares a BASE HREF.
   *
   * <p>When {@code TagMagix.getBaseHref} finds no existing BASE HREF, a BASE tag carrying the
   * capture's original URL is added through {@code insertAtStartOfHead}. A document that already
   * declares one is left untouched.
   */
  public void addBase() {

    // TODO: get url from Resource instead of SearchResult?
    String pageUrl = result.getOriginalUrl();

    if (TagMagix.getBaseHref(sb) == null) {
      // NOTE(review): pageUrl is concatenated into the attribute unescaped - confirm that a URL
      // containing a double quote cannot reach this point.
      insertAtStartOfHead("<base href=\"" + pageUrl + "\" />");
    }
  }
 /**
  * Feeds the dedup crawl-log fixture through {@code DeduplicateToCDXAdapter.adaptStream} and checks
  * that every emitted line parses into a result with a non-null mime type, and that more than two
  * lines were produced.
  *
  * @throws IOException if the fixture cannot be opened or read, or a stream fails to close
  */
 public void testAdaptStream() throws IOException {
   DeduplicateToCDXAdapterInterface adapter = new DeduplicateToCDXAdapter();
   ByteArrayOutputStream os = new ByteArrayOutputStream();
   InputStream is = new FileInputStream(new File(TestInfo.WORKING_DIR, DEDUP_CRAWL_LOG));
   try {
     adapter.adaptStream(is, os);
   } finally {
     // Release the file handle even when adaptStream throws.
     is.close();
   }
   os.close();
   String output = os.toString();
   String[] lines = output.split("\n");
   CDXLineToSearchResultAdapter adapter2 = new CDXLineToSearchResultAdapter();
   for (String line : lines) {
     CaptureSearchResult csr = adapter2.adapt(line);
     assertNotNull(
         "Should have a valid mime type for every line, inclding '" + line + "'",
         csr.getMimeType());
   }
   assertTrue("expect at least 3 lines of output, got " + lines.length, lines.length > 2);
 }
  /**
   * Converts two dedup crawl-log fixture lines to CDX lines, parses each back into a
   * {@code CaptureSearchResult}, and checks the arc file name and HTTP code of both.
   */
  public void testAdaptLine() {
    final String fileMessage = "Should get the arcfilename back out of the cdx line";
    final String codeMessage = "Should get the right http code out of the cdx line";

    final DeduplicateToCDXAdapterInterface dedupAdapter = new DeduplicateToCDXAdapter();

    // First fixture line.
    final String firstCdxLine = dedupAdapter.adaptLine(DEDUP_CRAWL_STRING);
    final CDXLineToSearchResultAdapter cdxAdapter = new CDXLineToSearchResultAdapter();
    final CaptureSearchResult first = cdxAdapter.adapt(firstCdxLine);
    assertEquals(
        fileMessage,
        "1-1-20090513141823-00008-sb-test-har-001.statsbiblioteket.dk.arc",
        first.getFile());
    assertEquals(codeMessage, "200", first.getHttpCode());

    // Second fixture line, run through the same two adapters.
    final String secondCdxLine = dedupAdapter.adaptLine(DEDUP_CRAWL_STRING2);
    final CaptureSearchResult second = cdxAdapter.adapt(secondCdxLine);
    assertEquals(
        fileMessage,
        "118657-119-20110428163750-00001-kb-prod-har-004.kb.dk.arc",
        second.getFile());
    assertEquals(codeMessage, "200", second.getHttpCode());
  }
Exemplo n.º 8
0
 /**
  * Runs the {@code TagMagix.markupCSSImports} pass over the document, using {@code uriConverter}
  * together with the capture's timestamp and original URL.
  *
  * <p>The capture's original URL is always used as the base; no BASE HREF lookup is performed.
  */
 public void resolveCSSUrls() {
   // TODO: get url from Resource instead of SearchResult?
   final String originalUrl = result.getOriginalUrl();
   final String timestamp = result.getCaptureTimestamp();
   TagMagix.markupCSSImports(sb, uriConverter, timestamp, originalUrl);
 }