/** * Update URLs inside the page, so those URLs which must be correct at page load time resolve * correctly to absolute URLs. * * <p>This means ensuring there is a BASE HREF tag, adding one if missing, and then resolving: * FRAME-SRC, META-URL, LINK-HREF, SCRIPT-SRC tag-attribute pairs against either the existing * BASE-HREF, or the page's absolute URL if it was missing. */ public void resolvePageUrls() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); String existingBaseHref = TagMagix.getBaseHref(sb); if (existingBaseHref == null) { insertAtStartOfHead("<base href=\"" + pageUrl + "\" />"); } else { pageUrl = existingBaseHref; } String markups[][] = { {"FRAME", "SRC"}, {"META", "URL"}, {"LINK", "HREF"}, {"SCRIPT", "SRC"}, {TagMagix.ANY_TAGNAME, "background"} }; // TODO: The classic WM added a js_ to the datespec, so NotInArchives // can return an valid javascript doc, and not cause Javascript errors. for (String tagAttr[] : markups) { TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, tagAttr[0], tagAttr[1]); } TagMagix.markupCSSImports(sb, uriConverter, captureDate, pageUrl); TagMagix.markupStyleUrls(sb, uriConverter, captureDate, pageUrl); }
/** * Update all URLs inside the page, so they resolve correctly to absolute URLs within the Wayback * service. */ public void resolveAllPageUrls() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); String existingBaseHref = TagMagix.getBaseHref(sb); if (existingBaseHref != null) { pageUrl = existingBaseHref; } ResultURIConverter ruc = new SpecialResultURIConverter(uriConverter); // TODO: forms...? String markups[][] = { {"FRAME", "SRC"}, {"META", "URL"}, {"LINK", "HREF"}, {"SCRIPT", "SRC"}, {"IMG", "SRC"}, {"A", "HREF"}, {"AREA", "HREF"}, {"OBJECT", "CODEBASE"}, {"OBJECT", "CDATA"}, {"APPLET", "CODEBASE"}, {"APPLET", "ARCHIVE"}, {"EMBED", "SRC"}, {"IFRAME", "SRC"}, {TagMagix.ANY_TAGNAME, "background"} }; for (String tagAttr[] : markups) { TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, tagAttr[0], tagAttr[1]); } TagMagix.markupCSSImports(sb, uriConverter, captureDate, pageUrl); TagMagix.markupStyleUrls(sb, uriConverter, captureDate, pageUrl); }
public void resolveASXRefUrls() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); ResultURIConverter ruc = new MMSToHTTPResultURIConverter(uriConverter); TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, "REF", "HREF"); }
/**
 * Adds to this partition every capture from {@code results} whose capture timestamp falls
 * within this partition's time range: at or after {@code startDateStr} and strictly before
 * {@code endDateStr}, compared as strings.
 *
 * @param results the captures to examine
 */
public void filter(CaptureSearchResults results) {
  for (Iterator<CaptureSearchResult> it = results.iterator(); it.hasNext(); ) {
    final CaptureSearchResult capture = it.next();
    final String timestamp = capture.getCaptureTimestamp();

    // Skip anything outside the half-open range [startDateStr, endDateStr).
    if (timestamp.compareTo(startDateStr) < 0 || timestamp.compareTo(endDateStr) >= 0) {
      continue;
    }
    matches.add(capture);
  }
}
public void addBase() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); String existingBaseHref = TagMagix.getBaseHref(sb); if (existingBaseHref == null) { insertAtStartOfHead("<base href=\"" + pageUrl + "\" />"); } else { pageUrl = existingBaseHref; } }
/**
 * Runs the dedup crawl log through {@code DeduplicateToCDXAdapter.adaptStream}, then checks that
 * every produced line adapts to a {@code CaptureSearchResult} with a non-null mime type and that
 * more than two lines were produced.
 *
 * @throws IOException if the crawl log cannot be opened or a stream operation fails
 */
public void testAdaptStream() throws IOException {
  InputStream is = new FileInputStream(new File(TestInfo.WORKING_DIR, DEDUP_CRAWL_LOG));
  OutputStream os = new ByteArrayOutputStream();
  try {
    DeduplicateToCDXAdapterInterface adapter = new DeduplicateToCDXAdapter();
    adapter.adaptStream(is, os);
  } finally {
    // Release the file handle even when adaptation fails; the original never closed it.
    is.close();
  }
  os.close();

  String output = os.toString();
  String[] lines = output.split("\n");
  CDXLineToSearchResultAdapter adapter2 = new CDXLineToSearchResultAdapter();
  for (String line : lines) {
    CaptureSearchResult csr = adapter2.adapt(line);
    assertNotNull(
        "Should have a valid mime type for every line, inclding '" + line + "'",
        csr.getMimeType());
  }
  assertTrue("expect at least 3 lines of output, got " + lines.length, lines.length > 2);
}
/**
 * Converts two sample dedup crawl-log lines to CDX lines, adapts each into a
 * {@code CaptureSearchResult}, and checks the resulting file name and HTTP code.
 */
public void testAdaptLine() {
  final DeduplicateToCDXAdapterInterface dedupAdapter = new DeduplicateToCDXAdapter();
  final CDXLineToSearchResultAdapter cdxAdapter;

  // First sample line.
  final String firstCdx = dedupAdapter.adaptLine(DEDUP_CRAWL_STRING);
  cdxAdapter = new CDXLineToSearchResultAdapter();
  final CaptureSearchResult firstCapture = cdxAdapter.adapt(firstCdx);
  assertEquals(
      "Should get the arcfilename back out of the cdx line",
      "1-1-20090513141823-00008-sb-test-har-001.statsbiblioteket.dk.arc",
      firstCapture.getFile());
  assertEquals(
      "Should get the right http code out of the cdx line", "200", firstCapture.getHttpCode());

  // Second sample line, reusing both adapters.
  final String secondCdx = dedupAdapter.adaptLine(DEDUP_CRAWL_STRING2);
  final CaptureSearchResult secondCapture = cdxAdapter.adapt(secondCdx);
  assertEquals(
      "Should get the arcfilename back out of the cdx line",
      "118657-119-20110428163750-00001-kb-prod-har-004.kb.dk.arc",
      secondCapture.getFile());
  assertEquals(
      "Should get the right http code out of the cdx line", "200", secondCapture.getHttpCode());
}
public void resolveCSSUrls() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); TagMagix.markupCSSImports(sb, uriConverter, captureDate, pageUrl); }