private boolean startReindexingMetadata(ArchivalUnit au, boolean force) { if (metadataMgr == null) { errMsg = "Metadata processing is not enabled."; return false; } if (!force) { if (!AuUtil.hasCrawled(au)) { errMsg = "Au has never crawled. Click again to reindex metadata"; showForceReindexMetadata = true; return false; } AuState auState = AuUtil.getAuState(au); switch (auState.getSubstanceState()) { case No: errMsg = "Au has no substance. Click again to reindex metadata"; showForceReindexMetadata = true; return false; case Unknown: errMsg = "Unknown substance for Au. Click again to reindex metadata."; showForceReindexMetadata = true; return false; case Yes: // fall through } } // Fully reindex metadata with the highest priority. Connection conn = null; PreparedStatement insertPendingAuBatchStatement = null; try { conn = dbMgr.getConnection(); insertPendingAuBatchStatement = metadataMgr.getPrioritizedInsertPendingAuBatchStatement(conn); if (metadataMgr.enableAndAddAuToReindex( au, conn, insertPendingAuBatchStatement, false, true)) { statusMsg = "Reindexing metadata for " + au.getName(); return true; } } catch (DbException dbe) { log.error("Cannot reindex metadata for " + au.getName(), dbe); } finally { DbManager.safeCloseStatement(insertPendingAuBatchStatement); DbManager.safeRollbackAndClose(conn); } if (force) { errMsg = "Still cannot reindex metadata for " + au.getName(); } else { errMsg = "Cannot reindex metadata for " + au.getName(); } return false; }
/** * Return a TitleConfig for the AU. Returns matching entry from the title db if found, else * creates one */ TitleConfig titleConfigFromAu(InactiveAuProxy au) { PluginProxy plugin = au.getPlugin(); String auname = au.getName(); Configuration auConfig = au.getConfiguration(); TitleConfig tc = AuUtil.findTitleConfig(auConfig, plugin.getPlugin()); if (tc != null) { return tc; } tc = new TitleConfig(auname, plugin.getPluginId()); ArrayList<ConfigParamAssignment> params = new ArrayList(); for (Iterator iter = auConfig.keyIterator(); iter.hasNext(); ) { String key = (String) iter.next(); if (!ConfigParamDescr.isReservedParam(key)) { String val = auConfig.get(key); ConfigParamDescr descr = plugin.findAuConfigDescr(key); if (descr != null) { ConfigParamAssignment cpa = new ConfigParamAssignment(descr, val); params.add(cpa); } else { log.warning("Unknown parameter key: " + key + " in au: " + auname); } } } params.trimToSize(); tc.setParams(params); return tc; }
protected void writeCu(CachedUrl cu) throws IOException { String url = cu.getUrl(); long contentSize = cu.getContentSize(); CIProperties props = cu.getProperties(); long fetchTime = Long.parseLong(props.getProperty(CachedUrl.PROPERTY_FETCH_TIME)); InputStream contentIn = cu.getUnfilteredInputStream(); try { if (isResponse) { String hdrString = getHttpResponseString(cu); long size = contentSize + hdrString.length(); InputStream headerIn = new ReaderInputStream(new StringReader(hdrString)); InputStream concat = new SequenceInputStream(headerIn, contentIn); try { aw.write(xlateFilename(url), cu.getContentType(), getHostIp(), fetchTime, size, concat); } finally { IOUtil.safeClose(concat); } } else { aw.write( xlateFilename(url), cu.getContentType(), getHostIp(), fetchTime, cu.getContentSize(), contentIn); } } finally { AuUtil.safeRelease(cu); } }
private void checkSubstance(ArchivalUnit au) { SubstanceChecker subChecker = new SubstanceChecker(au); if (!subChecker.isEnabled()) { errMsg = "No substance patterns defined for plugin."; return; } AuState auState = AuUtil.getAuState(au); SubstanceChecker.State oldState = auState.getSubstanceState(); SubstanceChecker.State newState = subChecker.findSubstance(); String chtxt = (newState == oldState ? "(unchanged)" : "(was " + oldState.toString() + ")"); switch (newState) { case Unknown: log.error("Shouldn't happen: SubstanceChecker returned Unknown"); errMsg = "Error in SubstanceChecker; see log."; break; case Yes: statusMsg = "AU has substance " + chtxt + ": " + au.getName(); auState.setSubstanceState(SubstanceChecker.State.Yes); break; case No: statusMsg = "AU has no substance " + chtxt + ": " + au.getName(); auState.setSubstanceState(SubstanceChecker.State.No); break; } }
private List getSummaryInfo(CrawlerStatus status) { List res = new ArrayList(); StatusTable.SummaryInfo statusSi = new StatusTable.SummaryInfo( "Status", ColumnDescriptor.TYPE_STRING, status.getCrawlStatusMsg()); ArchivalUnit au = status.getAu(); if (au != null) { AuState aus = AuUtil.getAuState(au); if (status.getCrawlStatus() == Crawler.STATUS_SUCCESSFUL && aus.hasNoSubstance()) { statusSi.setValueFootnote(FOOT_NO_SUBSTANCE_CRAWL_STATUS); } } res.add(statusSi); String sources = StringUtil.separatedString(status.getSources()); res.add(new StatusTable.SummaryInfo("Source", ColumnDescriptor.TYPE_STRING, sources)); String startUrls = StringUtil.separatedString(status.getStartUrls()); res.add( new StatusTable.SummaryInfo("Starting Url(s)", ColumnDescriptor.TYPE_STRING, startUrls)); return res; }
private void writeFiles() { PlatformUtil platutil = PlatformUtil.getInstance(); CuIterator iter = AuUtil.getCuIterator(au); int errs = 0; CachedUrl curCu = null; CachedUrl nextCu = getNextCu(iter); while (nextCu != null) { curCu = nextCu; nextCu = getNextCu(iter); if (excludeDirNodes && nextCu != null && isDirOf(curCu, nextCu)) { continue; } CachedUrl[] cuVersions = curCu.getCuVersions(maxVersions > 0 ? maxVersions : Integer.MAX_VALUE); for (CachedUrl cu : cuVersions) { try { log.debug2("Exporting " + cu.getUrl()); writeCu(cu); } catch (IOException e) { if (platutil.isDiskFullError(e)) { recordError("Disk full, can't write export file."); isDiskFull = true; return; } } catch (Exception e) { // XXX Would like to differentiate between errors opening or // reading CU, which shouldn't cause abort, and errors writing // to export file, which should. recordError("Unable to copy " + cu.getUrl(), e); if (errs++ >= maxErrors) { recordError("Aborting after " + errs + " errors"); return; } } } } }
/** engqueue a size calculation for the AU */ public void queueSizeCalc(ArchivalUnit au) { queueSizeCalc(AuUtil.getAuRepoNode(au)); }
/** Explode the archive into its constituent elements */ public void explode() throws CacheException { CachedUrl cachedUrl = null; int goodEntries = 0; int badEntries = 0; int ignoredEntries = 0; int entriesBetweenSleep = 0; ArchiveReader arcReader = null; logger.info( (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode"); try { if (storeArchive) { UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl)); BitSet bs = new BitSet(); bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG); uc.setFetchFlags(bs); uc.storeContent(); archiveData.resetInputStream(); arcStream = archiveData.input; } // Wrap it in an ArchiveReader logger.debug3("About to wrap stream"); arcReader = wrapStream(fetchUrl, arcStream); logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null")); // Explode it if (arcReader == null) { throw new CacheException.ExploderException("no WarcReader for " + origUrl); } ArchivalUnit au = crawlFacade.getAu(); Set stemSet = new HashSet(); logger.debug("Exploding " + fetchUrl); // Iterate through the elements in the WARC file, except the first Iterator i = arcReader.iterator(); // Skip first record for (i.next(); i.hasNext(); ) { // XXX probably not necessary helper.pokeWDog(); if ((++entriesBetweenSleep % sleepAfter) == 0) { long pauseTime = CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE); Deadline pause = Deadline.in(pauseTime); logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime)); while (!pause.expired()) { try { pause.sleep(); } catch (InterruptedException ie) { // no action } } } ArchiveRecord element = (ArchiveRecord) i.next(); // Each element is a URL to be cached in a suitable AU ArchiveRecordHeader elementHeader = element.getHeader(); String elementUrl = elementHeader.getUrl(); String elementMimeType = elementHeader.getMimetype(); long elementLength = elementHeader.getLength(); logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType); if (elementUrl.startsWith("http:")) { ArchiveEntry ae = new ArchiveEntry( elementUrl, elementLength, 0, // XXX need to convert getDate string to long element, // ArchiveRecord extends InputStream this, fetchUrl); ae.setHeaderFields(makeCIProperties(elementHeader)); long bytesStored = elementLength; logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored); try { helper.process(ae); } catch (PluginException ex) { throw new CacheException.ExploderException("helper.process() threw", ex); } if (ae.getBaseUrl() != null) { if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) { storeEntry(ae); handleAddText(ae); goodEntries++; crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored); } else { ignoredEntries++; } } else { badEntries++; logger.debug2("Can't map " + elementUrl + " from " + archiveUrl); } } } } catch (IOException ex) { throw new CacheException.ExploderException(ex); } finally { if (arcReader != null) try { arcReader.close(); arcReader = null; } catch (IOException ex) { throw new CacheException.ExploderException(ex); } if (cachedUrl != null) { cachedUrl.release(); } IOUtil.safeClose(arcStream); } if (badEntries == 0 && goodEntries > 0) { // Make it look like a new crawl finished on each AU to which // URLs were added. for (Iterator it = touchedAus.iterator(); it.hasNext(); ) { ArchivalUnit au = (ArchivalUnit) it.next(); logger.debug3(archiveUrl + " touching " + au.toString()); AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished(); } } else { ArchivalUnit au = crawlFacade.getAu(); String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries"; throw new CacheException.UnretryableException(msg); } }