/**
 * Return the archive file type corresponding to the CU's MIME type or filename
 * extension, or null if none.
 */
public String getFromCu(CachedUrl cu) throws MalformedURLException {
  String res = getFromMime(cu.getContentType());
  if (res == null) {
    res = getFromUrl(cu.getUrl());
  }
  return res;
}
private void deleteBlock(CachedUrl cu) throws IOException {
  log.info("deleting " + cu.getUrl());
  CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
  ArchivalUnit au = cu.getArchivalUnit();
  CachedUrlSet cus = au.makeCachedUrlSet(cuss);
  NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
  nm.deleteNode(cus);
}
/**
 * Return true if, interpreting URLs as filenames, dirCu is a directory containing
 * fileCu. Used to exclude directory content from output files, so they can be
 * unpacked by standard utilities (e.g., unzip). Shouldn't be called with equal
 * URLs, but return false in that case, as we wouldn't want to exclude the URL.
 */
boolean isDirOf(CachedUrl dirCu, CachedUrl fileCu) {
  String dir = dirCu.getUrl();
  String file = fileCu.getUrl();
  if (!dir.endsWith("/")) {
    dir = dir + "/";
  }
  return file.startsWith(dir) && !file.equals(dir);
}
public void testFunctionalFromTarHierarchy() throws Exception {
  log.debug3("in testFromTarHierarchy");
  // load the tarballs
  InputStream file_input = null;
  try {
    file_input = getResourceAsStream(realTARFile_A);
    // UrlCacher uc = au.makeUrlCacher(TAR_A_BASE);
    // uc.storeContent(file_input, tarHeader);
    UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE));
    uc.storeContent();
    IOUtil.safeClose(file_input);

    file_input = getResourceAsStream(realTARFile_B);
    // uc = au.makeUrlCacher(TAR_B_BASE);
    // uc.storeContent(file_input, tarHeader);
    uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE));
    uc.storeContent();
    IOUtil.safeClose(file_input);
  } catch (IOException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  } finally {
    IOUtil.safeClose(file_input);
  }

  CachedUrlSet cus = tarAu.getAuCachedUrlSet();
  for (CachedUrl cu : cus.getCuIterable()) {
    log.debug3("AU - cu is: " + cu.getUrl());
    cu.release();
  }

  // We need to start from the level of the ArticleMetadataExtractor
  MyListEmitter emitter = new MyListEmitter();
  ArticleMetadataExtractor amEx =
      new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA);

  Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any());
  while (it.hasNext()) {
    ArticleFiles af = it.next();
    log.debug3("Metadata test - articlefiles " + af.toString());
    // CachedUrl cu = af.getFullTextCu();
    CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA);
    log.debug3("metadata cu is " + cu.getUrl());
    // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu);
    amEx.extract(MetadataTarget.Any(), af, emitter);
    List<ArticleMetadata> returnList = emitter.getAmList();
    assertNotNull(returnList);
    log.debug3("size of returnList is " + returnList.size());
    Iterator<ArticleMetadata> mdIt = returnList.iterator();
    ArticleMetadata mdRecord = null;
    while (mdIt.hasNext()) {
      mdRecord = (ArticleMetadata) mdIt.next();
      validateCompleteMetadataRecord(mdRecord);
    }
  }
}
public void testArticleCountAndType() throws Exception {
  int expCount = 28;
  PluginTestUtil.crawlSimAu(sau);
  String pat1 = "branch(\\d+)/(\\d+file\\.html)";
  String rep1 = "aps/journal/v123/n$1/full/$2";
  PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
  String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
  String rep2 = "aps/journal/v123/n$1/pdf/$2";
  PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

  // Remove some URLs
  int deleted = 0;
  for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
    CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
    if (cusn instanceof CachedUrl) {
      CachedUrl cu = (CachedUrl) cusn;
      String url = cu.getUrl();
      if (url.contains("/journal/")
          && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
        deleteBlock(cu);
        ++deleted;
      }
    }
  }
  assertEquals(8, deleted);

  Iterator<ArticleFiles> it = nau.getArticleIterator();
  int count = 0;
  int countHtmlOnly = 0;
  int countPdfOnly = 0;
  while (it.hasNext()) {
    ArticleFiles af = it.next();
    log.info(af.toString());
    CachedUrl cu = af.getFullTextCu();
    // assert before dereferencing, so a missing full-text CU fails cleanly
    assertNotNull(cu);
    String url = cu.getUrl();
    String contentType = cu.getContentType();
    log.debug("count " + count + " url " + url + " " + contentType);
    count++;
    if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
      ++countHtmlOnly;
    }
    // compare by value, not by reference
    if (url.equals(af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF))) {
      ++countPdfOnly;
    }
  }
  log.debug("Article count is " + count);
  assertEquals(expCount, count);
  assertEquals(4, countHtmlOnly);
  assertEquals(4, countPdfOnly);
}
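/*
 * A minimal standalone sketch (illustrative only, not the LOCKSS API) of the URL
 * rewrite that the pat1/rep1 pair above asks PluginTestUtil.copyAu() to apply: the
 * simulated-crawl path is matched against the pattern and rewritten using the
 * $1/$2 group references. The sample URL is an assumption for demonstration.
 */
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class CopyAuRewriteSketch {
  public static void main(String[] args) {
    Pattern pat1 = Pattern.compile("branch(\\d+)/(\\d+file\\.html)");
    String rep1 = "aps/journal/v123/n$1/full/$2";
    String simUrl = "http://www.example.com/branch2/003file.html";
    Matcher m = pat1.matcher(simUrl);
    if (m.find()) {
      // prints http://www.example.com/aps/journal/v123/n2/full/003file.html
      System.out.println(m.replaceFirst(rep1));
    }
  }
}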
@Override
public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter) throws IOException {
  ArticleMetadata am = new SimpleHtmlMetaTagMetadataExtractor().extract(target, cu);
  am.cook(tagMap);
  String url = am.get(MetadataField.FIELD_ACCESS_URL);
  ArchivalUnit au = cu.getArchivalUnit();
  if (url == null || url.isEmpty() || !au.makeCachedUrl(url).hasContent()) {
    url = cu.getUrl();
  }
  am.replace(MetadataField.FIELD_ACCESS_URL,
             HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(au, url));
  emitter.emitMetadata(cu, am);
}
@Override
public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
  ArticleMetadata am = super.extract(target, cu);
  am.cook(tagMap);
  String url = am.get(MetadataField.FIELD_ACCESS_URL);
  if (url != null && !url.isEmpty()) {
    CachedUrl val = cu.getArchivalUnit().makeCachedUrl(url);
    if (!val.hasContent()) {
      am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
    }
  } else {
    am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
  }
  return am;
}
protected void writeCu(CachedUrl cu) throws IOException {
  String url = cu.getUrl();
  long contentSize = cu.getContentSize();
  CIProperties props = cu.getProperties();
  long fetchTime = Long.parseLong(props.getProperty(CachedUrl.PROPERTY_FETCH_TIME));
  InputStream contentIn = cu.getUnfilteredInputStream();
  try {
    if (isResponse) {
      String hdrString = getHttpResponseString(cu);
      long size = contentSize + hdrString.length();
      InputStream headerIn = new ReaderInputStream(new StringReader(hdrString));
      InputStream concat = new SequenceInputStream(headerIn, contentIn);
      try {
        aw.write(xlateFilename(url), cu.getContentType(), getHostIp(), fetchTime, size, concat);
      } finally {
        IOUtil.safeClose(concat);
      }
    } else {
      aw.write(xlateFilename(url), cu.getContentType(), getHostIp(), fetchTime,
               cu.getContentSize(), contentIn);
    }
  } finally {
    AuUtil.safeRelease(cu);
  }
}
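/*
 * A minimal standalone sketch (sample header and body text are assumptions) of the
 * stream-concatenation pattern writeCu() uses for response-style records: the HTTP
 * header block, rendered as a String, is turned into an InputStream and prepended
 * to the content stream with SequenceInputStream, so the archive writer sees one
 * continuous record.
 */
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.nio.charset.StandardCharsets;

class HeaderPrependSketch {
  public static void main(String[] args) throws Exception {
    String hdrString = "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n";
    InputStream headerIn =
        new ByteArrayInputStream(hdrString.getBytes(StandardCharsets.US_ASCII));
    InputStream contentIn =
        new ByteArrayInputStream("<html>body</html>".getBytes(StandardCharsets.UTF_8));
    try (InputStream concat = new SequenceInputStream(headerIn, contentIn)) {
      // Reads the header bytes first, then the content bytes, as one stream.
      System.out.println(new String(concat.readAllBytes(), StandardCharsets.UTF_8));
    }
  }
}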
private String getUrlContent(CachedUrl url) throws IOException {
  InputStream content = url.getUnfilteredInputStream();
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  StreamUtil.copy(content, baos);
  content.close();
  String contentStr = new String(baos.toByteArray());
  baos.close();
  return contentStr;
}
@Override
public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
  log.debug3("Metadata - cachedurl cu:" + cu.getUrl());
  ArticleMetadata am = super.extract(target, cu);
  am.cook(tagMap);
  return am;
} // extract
protected String getHttpResponseString(CachedUrl cu) {
  Properties cuProps = cu.getProperties();
  Properties filteredProps = filterResponseProps(cuProps);
  String hdrString = PropUtil.toHeaderString(filteredProps);
  StringBuilder sb = new StringBuilder(hdrString.length() + 30);
  String line1 = inferHttpResponseCode(cu, cuProps);
  sb.append(line1);
  sb.append(Constants.CRLF);
  sb.append(hdrString);
  sb.append(Constants.CRLF);
  return sb.toString();
}
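/*
 * A minimal standalone sketch (status line and header values are assumptions) of
 * the string getHttpResponseString() assembles: an inferred status line, CRLF, the
 * stored header properties rendered as "Name: value" lines, and a final CRLF to
 * terminate the header block.
 */
class ResponseStringSketch {
  public static void main(String[] args) {
    String CRLF = "\r\n";
    String line1 = "HTTP/1.1 200 OK";                 // stand-in for inferHttpResponseCode()
    String hdrString = "Content-Type: text/html" + CRLF
        + "Last-Modified: Tue, 01 Jan 2019 00:00:00 GMT" + CRLF;  // stand-in for PropUtil.toHeaderString()
    StringBuilder sb = new StringBuilder(hdrString.length() + 30);
    sb.append(line1);
    sb.append(CRLF);
    sb.append(hdrString);
    sb.append(CRLF);
    System.out.print(sb);
  }
}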
/**
 * Look up the CU's archive file type in its AU's ArchiveFileTypes.
 *
 * @return the file extension (including dot), or null if none found
 */
public static String getArchiveExtension(CachedUrl cu) {
  ArchiveFileTypes aft = cu.getArchivalUnit().getArchiveFileTypes();
  if (aft == null) {
    return null;
  }
  try {
    return aft.getFromCu(cu);
  } catch (MalformedURLException e) {
    log.warning("isArchive(" + cu + ")", e);
    return null;
  }
}
protected void checkFilter(SimulatedArchivalUnit sau) throws Exception {
  log.debug("checkFilter()");
  CachedUrl cu = sau.makeCachedUrl(sau.getUrlRoot() + "/001file.html");

  enableFilter(sau, true);
  InputStream is = cu.openForHashing();
  String expected = "001file.html This is file 1, depth 0, branch 0. foobar ";
  assertEquals(expected, StringUtil.fromInputStream(is));
  is.close();

  enableFilter(sau, false);
  cu = sau.makeCachedUrl(sau.getUrlRoot() + "/001file.html");
  is = cu.openForHashing();
  expected =
      "<HTML><HEAD><TITLE>001file.html</TITLE></HEAD><BODY>\n"
          + "This is file 1, depth 0, branch 0.<br><!-- comment --> "
          + "Citation String foobar<br><script>"
          + "(defun fact (n) (cond ((= n 0) 1) (t (fact (sub1 n)))))</script>\n"
          + "</BODY></HTML>";
  assertEquals(expected, StringUtil.fromInputStream(is));
  is.close();
}
private void writeFiles() {
  PlatformUtil platutil = PlatformUtil.getInstance();
  CuIterator iter = AuUtil.getCuIterator(au);
  int errs = 0;
  CachedUrl curCu = null;
  CachedUrl nextCu = getNextCu(iter);
  while (nextCu != null) {
    curCu = nextCu;
    nextCu = getNextCu(iter);
    if (excludeDirNodes && nextCu != null && isDirOf(curCu, nextCu)) {
      continue;
    }
    CachedUrl[] cuVersions =
        curCu.getCuVersions(maxVersions > 0 ? maxVersions : Integer.MAX_VALUE);
    for (CachedUrl cu : cuVersions) {
      try {
        log.debug2("Exporting " + cu.getUrl());
        writeCu(cu);
      } catch (IOException e) {
        if (platutil.isDiskFullError(e)) {
          recordError("Disk full, can't write export file.");
          isDiskFull = true;
          return;
        }
      } catch (Exception e) {
        // XXX Would like to differentiate between errors opening or
        // reading CU, which shouldn't cause abort, and errors writing
        // to export file, which should.
        recordError("Unable to copy " + cu.getUrl(), e);
        if (errs++ >= maxErrors) {
          recordError("Aborting after " + errs + " errors");
          return;
        }
      }
    }
  }
}
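/*
 * A minimal standalone sketch (sample URLs are assumptions) of the one-element
 * look-ahead used by writeFiles(): the loop always holds the current item and the
 * next one, so a "directory" entry can be skipped when the following entry is its
 * own content, mirroring the excludeDirNodes/isDirOf() check above.
 */
import java.util.Iterator;
import java.util.List;

class LookAheadSkipSketch {
  static boolean isDirOf(String dir, String file) {
    if (!dir.endsWith("/")) {
      dir = dir + "/";
    }
    return file.startsWith(dir) && !file.equals(dir);
  }

  public static void main(String[] args) {
    List<String> urls = List.of(
        "http://example.com/a",          // directory node: followed by its own content
        "http://example.com/a/1.html",
        "http://example.com/b.html");
    Iterator<String> iter = urls.iterator();
    String next = iter.hasNext() ? iter.next() : null;
    while (next != null) {
      String cur = next;
      next = iter.hasNext() ? iter.next() : null;
      if (next != null && isDirOf(cur, next)) {
        continue;  // skip the directory entry itself
      }
      System.out.println("export " + cur);
    }
  }
}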
/*
 * hasArticleMetadata(CachedUrl cu)
 * Given the CachedUrl for the potential abstract file, use the existing
 * SimpleHtmlMetaTagMetadataExtractor to parse the file and retrieve any contained
 * metadata. If a doi or author exists, it's an article.
 * NOT defining the Metadata Extractor here!
 */
private boolean hasArticleMetadata(CachedUrl cu) {
  MetadataTarget at = new MetadataTarget(MetadataTarget.PURPOSE_ARTICLE);
  ArticleMetadata am;
  SimpleHtmlMetaTagMetadataExtractor ext = new SimpleHtmlMetaTagMetadataExtractor();
  if (cu != null && cu.hasContent()) {
    try {
      at.setFormat("text/html");
      am = ext.extract(at, cu);
      if (am.containsRawKey("bepress_citation_journal_title")
          || am.containsRawKey("bepress_citation_abstract_html_url")
          || am.containsRawKey("bepress_citation_doi")
          || am.containsRawKey("bepress_citation_author")) {
        return true;
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  return false; // no reasonable metadata, probably a toc
}
/*
 * This is complicated. MOST AUs have articles that live below an issue-level TOC,
 * that is:
 *   <blah>/<journal_id>/vol#/iss#/     is a toc with no relevant metadata
 *   <blah>/<journal_id>/vol#/iss#/xxx  is an article with metadata
 *   (eg Economist Voice V1)
 * BUT in some AUs there are issues with only 1 article, in which case
 *   <blah>/<journal_id>/vol#/iss#/     is an abstract with metadata
 *   (eg Rhodes Cook V4)
 * and a few AUs with a mixture
 *   (eg Forum for Health Economics V5)
 * So to identify ALL articles, we'll also have to capture issue-level items and
 * then look at the html and, if it has article metadata in it, count it as an
 * article.
 */
@Override
protected ArticleFiles createArticleFiles(CachedUrl cu) {
  String url = cu.getUrl();
  Matcher mat = pattern.matcher(url);
  if (mat.find()) {
    // we matched, but could this pattern potentially be a toc?
    Matcher tocmat = TOC_pattern.matcher(url);
    // if we could be a TOC then we must have metadata to be considered an article
    if (tocmat.find()) {
      if (hasArticleMetadata(cu)) {
        return processUrl(cu, mat);
      }
    } else {
      // we're not a potential TOC, so treat this as an article without checking
      return processUrl(cu, mat);
    }
    return null; // this was a TOC, not an article
  }
  log.warning("Mismatch between article iterator factory and article iterator: " + url);
  return null;
}
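/*
 * A minimal standalone sketch of the two-pattern decision above. The regexes and
 * URLs here are illustrative assumptions, not the plugin's actual patterns: a URL
 * that matches the article pattern is accepted outright unless it also looks like
 * an issue-level TOC, in which case it would additionally need article metadata
 * (the hasArticleMetadata() check) before being counted.
 */
import java.util.regex.Pattern;

class TocVsArticleSketch {
  // Hypothetical stand-ins for the factory's `pattern` and `TOC_pattern`
  static final Pattern ARTICLE_PAT = Pattern.compile("/[^/]+/vol\\d+/iss\\d+(/.*)?$");
  static final Pattern TOC_PAT = Pattern.compile("/[^/]+/vol\\d+/iss\\d+/?$");

  public static void main(String[] args) {
    String[] urls = {
      "http://www.example.com/ev/vol1/iss1/",      // potential TOC: needs metadata check
      "http://www.example.com/ev/vol1/iss1/art3",  // plain article: accepted directly
    };
    for (String url : urls) {
      if (ARTICLE_PAT.matcher(url).find()) {
        boolean potentialToc = TOC_PAT.matcher(url).find();
        System.out.println(url + (potentialToc
            ? " -> check hasArticleMetadata() before counting"
            : " -> count as article"));
      }
    }
  }
}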
/** Explode the archive into its constituent elements */
public void explode() throws CacheException {
  CachedUrl cachedUrl = null;
  int goodEntries = 0;
  int badEntries = 0;
  int ignoredEntries = 0;
  int entriesBetweenSleep = 0;
  ArchiveReader arcReader = null;

  logger.info(
      (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
  try {
    if (storeArchive) {
      UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
      BitSet bs = new BitSet();
      bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
      uc.setFetchFlags(bs);
      uc.storeContent();
      archiveData.resetInputStream();
      arcStream = archiveData.input;
    }
    // Wrap it in an ArchiveReader
    logger.debug3("About to wrap stream");
    arcReader = wrapStream(fetchUrl, arcStream);
    logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
    // Explode it
    if (arcReader == null) {
      throw new CacheException.ExploderException("no WarcReader for " + origUrl);
    }
    ArchivalUnit au = crawlFacade.getAu();
    Set stemSet = new HashSet();
    logger.debug("Exploding " + fetchUrl);
    // Iterate through the elements in the WARC file, except the first
    Iterator i = arcReader.iterator();
    // Skip first record
    for (i.next(); i.hasNext(); ) {
      // XXX probably not necessary
      helper.pokeWDog();
      if ((++entriesBetweenSleep % sleepAfter) == 0) {
        long pauseTime =
            CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
        Deadline pause = Deadline.in(pauseTime);
        logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
        while (!pause.expired()) {
          try {
            pause.sleep();
          } catch (InterruptedException ie) {
            // no action
          }
        }
      }
      ArchiveRecord element = (ArchiveRecord) i.next();
      // Each element is a URL to be cached in a suitable AU
      ArchiveRecordHeader elementHeader = element.getHeader();
      String elementUrl = elementHeader.getUrl();
      String elementMimeType = elementHeader.getMimetype();
      long elementLength = elementHeader.getLength();
      logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
      if (elementUrl.startsWith("http:")) {
        ArchiveEntry ae =
            new ArchiveEntry(
                elementUrl,
                elementLength,
                0, // XXX need to convert getDate string to long
                element, // ArchiveRecord extends InputStream
                this,
                fetchUrl);
        ae.setHeaderFields(makeCIProperties(elementHeader));
        long bytesStored = elementLength;
        logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
        try {
          helper.process(ae);
        } catch (PluginException ex) {
          throw new CacheException.ExploderException("helper.process() threw", ex);
        }
        if (ae.getBaseUrl() != null) {
          if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
            storeEntry(ae);
            handleAddText(ae);
            goodEntries++;
            crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
          } else {
            ignoredEntries++;
          }
        } else {
          badEntries++;
          logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
        }
      }
    }
  } catch (IOException ex) {
    throw new CacheException.ExploderException(ex);
  } finally {
    if (arcReader != null) {
      try {
        arcReader.close();
        arcReader = null;
      } catch (IOException ex) {
        throw new CacheException.ExploderException(ex);
      }
    }
    if (cachedUrl != null) {
      cachedUrl.release();
    }
    IOUtil.safeClose(arcStream);
  }

  if (badEntries == 0 && goodEntries > 0) {
    // Make it look like a new crawl finished on each AU to which URLs were added.
    for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
      ArchivalUnit au = (ArchivalUnit) it.next();
      logger.debug3(archiveUrl + " touching " + au.toString());
      AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
    }
  } else {
    ArchivalUnit au = crawlFacade.getAu();
    String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
    throw new CacheException.UnretryableException(msg);
  }
}
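/*
 * A minimal standalone sketch (entry counts and sleep interval are illustrative
 * assumptions) of the throttling pattern inside explode()'s record loop: after
 * every `sleepAfter` entries the loop pauses, so exploding a large WARC file
 * doesn't monopolize the machine.
 */
class ThrottledLoopSketch {
  public static void main(String[] args) throws InterruptedException {
    int sleepAfter = 10;   // pause after this many entries
    long pauseMs = 50;     // illustrative pause length
    int entriesBetweenSleep = 0;
    for (int entry = 0; entry < 35; entry++) {
      if ((++entriesBetweenSleep % sleepAfter) == 0) {
        System.out.println("sleeping after entry " + entry);
        Thread.sleep(pauseMs);
      }
      // ... process the entry ...
    }
  }
}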