/** * Compiles and returns a report on its status. * * @param name Report name. * @param pw Where to print. */ public void reportTo(String name, PrintWriter pw) { // name is ignored for now: only one kind of report pw.print("["); pw.println(getName()); // Make a local copy of the currentCuri reference in case it gets // nulled while we're using it. We're doing this because // alternative is synchronizing and we don't want to do this -- // it causes hang ups as controller waits on a lock for this thread, // something it gets easily enough on old threading model but something // it can wait interminably for on NPTL threading model. // See [ 994946 ] Pause/Terminate ignored on 2.6 kernel 1.5 JVM. CrawlURI c = currentCuri; if (c != null) { pw.print(" "); c.shortReportLineTo(pw); pw.print(" "); pw.print(c.getFetchAttempts()); pw.print(" attempts"); pw.println(); pw.print(" "); pw.print("in processor: "); pw.print(currentProcessorName); } else { pw.print(" -no CrawlURI- "); } pw.println(); long now = System.currentTimeMillis(); long time = 0; pw.print(" "); if (lastFinishTime > lastStartTime) { // That means we finished something after we last started something // or in other words we are not working on anything. pw.print("WAITING for "); time = now - lastFinishTime; } else if (lastStartTime > 0) { // We are working on something pw.print("ACTIVE for "); time = now - lastStartTime; } pw.print(ArchiveUtils.formatMillisecondsToConventional(time)); pw.println(); pw.print(" "); pw.print("step: "); pw.print(step); pw.print(" for "); pw.print( ArchiveUtils.formatMillisecondsToConventional(System.currentTimeMillis() - atStepSince)); pw.println(); reportThread(this, pw); pw.print("]"); pw.println(); pw.flush(); }
/**
 * Builds an insertion-ordered map summarising this thread's current state.
 *
 * <p>Keys, in order: {@code serialNumber}, {@code currentURI}, then (only while a CrawlURI is
 * held) {@code currentProcessor} and {@code fetchAttempts}, then {@code status} (only when a
 * WAITING/ACTIVE state can be determined), {@code currentStatusElapsedMilliseconds}, {@code
 * currentStatusElapsedPretty} and {@code step}.
 *
 * @return a new map describing the current state; {@code currentURI} maps to null when no
 *     CrawlURI is held
 */
public Map<String, Object> shortReportMap() {
  final Map<String, Object> report = new LinkedHashMap<String, Object>();
  report.put("serialNumber", serialNumber);

  // Snapshot the field once so a concurrent reset to null cannot be observed half-way through.
  final CrawlURI snapshot = currentCuri;
  if (snapshot == null) {
    report.put("currentURI", null);
  } else {
    report.put("currentURI", snapshot.toString());
    report.put("currentProcessor", currentProcessorName);
    report.put("fetchAttempts", snapshot.getFetchAttempts());
  }

  final long now = System.currentTimeMillis();
  String status = null;
  long elapsed = 0;
  if (lastFinishTime > lastStartTime) {
    // Last event was a finish: the thread is idle.
    status = "WAITING";
    elapsed = now - lastFinishTime;
  } else if (lastStartTime > 0) {
    // A start with no later finish: the thread is busy.
    status = "ACTIVE";
    elapsed = now - lastStartTime;
  }
  if (status != null) {
    report.put("status", status);
  }

  report.put("currentStatusElapsedMilliseconds", elapsed);
  report.put(
      "currentStatusElapsedPretty", ArchiveUtils.formatMillisecondsToConventional(elapsed));
  report.put("step", step);
  return report;
}
{
  // Instance initializer: builds the gzip sample members used by this class.
  // A fixed seed makes the pseudo-random "noise" payloads identical on every run.
  Random rand = new Random(1);
  try {
    // 1 KiB of pseudo-random bytes, gzipped.
    byte[] buf = new byte[1024];
    rand.nextBytes(buf);
    noise1k_gz = ArchiveUtils.gzip(buf);

    // 32 KiB of pseudo-random bytes, gzipped.
    buf = new byte[32 * 1024];
    rand.nextBytes(buf);
    noise32k_gz = ArchiveUtils.gzip(buf);

    // Two tiny members with known plain-text content.
    a_gz = ArchiveUtils.gzip("a".getBytes("ASCII"));
    hello_gz = ArchiveUtils.gzip("hello".getBytes("ASCII"));

    // Multi-member byte sequences assembled by concatenating the members above.
    allfour_gz = Bytes.concat(noise1k_gz, noise32k_gz, a_gz, hello_gz);
    sixsmall_gz = Bytes.concat(a_gz, hello_gz, a_gz, hello_gz, a_gz, hello_gz);
  } catch (IOException e) {
    // Not expected for in-memory data, but if it does happen the fields above would be left
    // null and fail later with an unrelated NullPointerException; fail here with the cause.
    throw new IllegalStateException("unable to build gzip sample data", e);
  }
}
/** @param w PrintWriter to write to. */ public void shortReportLineTo(PrintWriter w) { w.print("#"); w.print(this.serialNumber); // Make a local copy of the currentCuri reference in case it gets // nulled while we're using it. We're doing this because // alternative is synchronizing and we don't want to do this -- // it causes hang ups as controller waits on a lock for this thread, // something it gets easily enough on old threading model but something // it can wait interminably for on NPTL threading model. // See [ 994946 ] Pause/Terminate ignored on 2.6 kernel 1.5 JVM. CrawlURI c = currentCuri; if (c != null) { w.print(" "); w.print(currentProcessorName); w.print(" "); w.print(c.toString()); w.print(" ("); w.print(c.getFetchAttempts()); w.print(") "); } else { w.print(" [no CrawlURI] "); } long now = System.currentTimeMillis(); long time = 0; if (lastFinishTime > lastStartTime) { // That means we finished something after we last started something // or in other words we are not working on anything. w.print("WAITING for "); time = now - lastFinishTime; } else if (lastStartTime > 0) { // We are working on something w.print("ACTIVE for "); time = now - lastStartTime; } w.print(ArchiveUtils.formatMillisecondsToConventional(time)); w.print(" at "); w.print(step); w.print(" for "); w.print(ArchiveUtils.formatMillisecondsToConventional(now - atStepSince)); w.print("\n"); w.flush(); }
protected CrawlURI peekItem(final WorkQueueFrontier frontier) throws IOException { final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier).getWorkQueues(); DatabaseEntry key = new DatabaseEntry(origin); CrawlURI curi = null; int tries = 1; while (true) { try { curi = queues.get(key); } catch (DatabaseException e) { LOGGER.log(Level.SEVERE, "peekItem failure; retrying", e); } // ensure CrawlURI, if any, came from acceptable range: if (!ArchiveUtils.startsWith(key.getData(), origin)) { LOGGER.severe( "inconsistency: " + classKey + "(" + getPrefixClassKey(origin) + ") with " + getCount() + " items gave " + curi + "(" + getPrefixClassKey(key.getData())); // clear curi to allow retry curi = null; // reset key to original origin for retry key.setData(origin); } if (curi != null) { // success break; } if (tries > 3) { LOGGER.severe("no item where expected in queue " + classKey); break; } tries++; LOGGER.severe( "Trying get #" + Integer.toString(tries) + " in queue " + classKey + " with " + getCount() + " items using key " + getPrefixClassKey(key.getData())); } return curi; }
/** * Populates a given StoredSortedMap (history map) from an old environment db or a persist log. If * a map is not provided, only logs the entries that would have been populated. * * @param sourceFile source of old entries: can be a path to an existing environment db or persist * log * @param historyMap map to populate (or null for a dry run) * @return number of records * @throws DatabaseException * @throws IOException */ public static int copyPersistSourceToHistoryMap( File sourceFile, StoredSortedMap<String, Map> historyMap) throws DatabaseException, IOException { // delegate depending on the source if (sourceFile.isDirectory()) { return copyPersistEnv(sourceFile, historyMap); } else { BufferedReader persistLogReader = ArchiveUtils.getBufferedReader(sourceFile); return populatePersistEnvFromLog(persistLogReader, historyMap); } }
/** * Populates an environment db from a persist log. If historyMap is not provided, only logs the * entries that would have been populated. * * @param persistLogReader persist log * @param historyMap new environment db (or null for a dry run) * @return number of records * @throws UnsupportedEncodingException * @throws DatabaseException */ private static int populatePersistEnvFromLog( BufferedReader persistLogReader, StoredSortedMap<String, Map> historyMap) throws UnsupportedEncodingException, DatabaseException { int count = 0; Iterator<String> iter = new LineReadingIterator(persistLogReader); while (iter.hasNext()) { String line = iter.next(); if (line.length() == 0) { continue; } String[] splits = line.split(" "); if (splits.length != 2) { logger.severe("bad line has " + splits.length + " fields (should be 2): " + line); continue; } Map alist; try { alist = (Map) SerializationUtils.deserialize(Base64.decodeBase64(splits[1].getBytes("UTF-8"))); } catch (Exception e) { logger.severe("caught exception " + e + " deserializing line: " + line); continue; } if (logger.isLoggable(Level.FINE)) { logger.fine(splits[0] + " " + ArchiveUtils.prettyString(alist)); } if (historyMap != null) try { historyMap.put(splits[0], alist); } catch (Exception e) { logger.log( Level.SEVERE, "caught exception after loading " + count + " urls from the persist log (perhaps crawl was stopped by user?)", e); IOUtils.closeQuietly(persistLogReader); // seems to finish most cleanly when we return rather than throw something return count; } count++; } IOUtils.closeQuietly(persistLogReader); return count; }
/**
 * Builds the insert key that positions a CrawlURI within its queue.
 *
 * <p>Layout: the UTF-8 bytes of the classKey (usu. host), which groups entries by host; one zero
 * byte as terminator; then one 8-byte long handed to {@code ArchiveUtils.longIntoByteArray}. In
 * that long, bits 56-63 carry the scheduling directive (so 'immediate' and 'soon' items sort
 * above regular ones), bits 48-55 carry the precedence (capped at 127), and the low 48 bits carry
 * the ordinal serial number so earlier-discovered URIs sort before later ones.
 *
 * <p>NOTE: Dangers here are: (1) priorities or precedences over 2^7 (signed byte comparison) (2)
 * ordinals over 2^48
 *
 * <p>Package access & static for testing purposes.
 *
 * @param curi the CrawlURI to compute a key for
 * @return a DatabaseEntry key for the CrawlURI
 */
static DatabaseEntry calculateInsertKey(CrawlURI curi) {
  final byte[] prefix = curi.getClassKey().getBytes(Charsets.UTF_8);
  final int prefixLength = prefix.length;

  // prefix + 1 terminator byte + 8 ordering bytes
  final byte[] keyBytes = new byte[prefixLength + 9];
  System.arraycopy(prefix, 0, keyBytes, 0, prefixLength);
  keyBytes[prefixLength] = 0;

  // Assemble the three bit-fields of the ordering long.
  final long ordinalBits = curi.getOrdinal() & 0x0000FFFFFFFFFFFFL;
  final long directiveBits = (long) curi.getSchedulingDirective() << 56;
  final long precedenceBits = (Math.min(curi.getPrecedence(), 127) & 0xFFL) << 48;
  final long ordering = precedenceBits | (directiveBits | ordinalBits);

  ArchiveUtils.longIntoByteArray(ordering, keyBytes, prefixLength + 1);
  return new DatabaseEntry(keyBytes);
}
private void seriousError(Error err) { // try to prevent timeslicing until we have a chance to deal with OOM // Note that modern-day JVM priority indifference with native threads // may make this priority-jumbling pointless setPriority(DEFAULT_PRIORITY + 1); if (controller != null) { // hold all ToeThreads from proceeding to next processor controller.freeReserveMemory(); controller.requestCrawlPause(); if (controller.getFrontier().getFrontierJournal() != null) { controller.getFrontier().getFrontierJournal().seriousError(getName() + err.getMessage()); } } // OutOfMemory etc. String extraInfo = DevUtils.extraInfo(); System.err.println("<<<"); System.err.println(ArchiveUtils.getLog17Date()); System.err.println(err); System.err.println(extraInfo); err.printStackTrace(System.err); if (controller != null) { PrintWriter pw = new PrintWriter(System.err); controller.getToePool().compactReportTo(pw); pw.flush(); } System.err.println(">>>"); // DevUtils.sigquitSelf(); String context = "unknown"; if (currentCuri != null) { // update fetch-status, saving original as annotation currentCuri.getAnnotations().add("err=" + err.getClass().getName()); currentCuri.getAnnotations().add("os" + currentCuri.getFetchStatus()); currentCuri.setFetchStatus(S_SERIOUS_ERROR); context = currentCuri.shortReportLine() + " in " + currentProcessorName; } String message = "Serious error occured trying " + "to process '" + context + "'\n" + extraInfo; logger.log(Level.SEVERE, message.toString(), err); setPriority(DEFAULT_PRIORITY); }
/** Explode the archive into its constituent elements */ public void explode() throws CacheException { int goodEntries = 0; int badEntries = 0; int entriesBetweenSleep = 0; ArchiveReader arcReader = null; logger.debug( (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode"); try { // Wrap it in an ArchiveReader logger.debug3("About to wrap stream"); arcReader = wrapStream(fetchUrl, arcStream); logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null")); // Explode it if (arcReader == null) { throw new CacheException.ExploderException("no WarcReader for " + origUrl); } ArchivalUnit au = crawlFacade.getAu(); logger.debug("Exploding " + fetchUrl); // Iterate through the elements in the WARC file, except the first Iterator<ArchiveRecord> iter = arcReader.iterator(); // Skip first record if (iter.hasNext()) iter.next(); while (iter.hasNext()) { helper.pokeWDog(); // check need to pause handlePause(++entriesBetweenSleep); // handle each element in the archive ArchiveRecord element = iter.next(); // Each element is a URL to be cached in our AU ArchiveRecordHeader elementHeader = element.getHeader(); String elementUrl = elementHeader.getUrl(); String elementMimeType = elementHeader.getMimetype(); long elementLength = elementHeader.getLength(); long elementDate; try { elementDate = ArchiveUtils.parse14DigitDate(elementHeader.getDate()).getTime(); } catch (ParseException e) { elementDate = 0; } logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType); // add check to determine if this is a url which should be cached if (au.shouldBeCached(elementUrl) && elementUrl.startsWith("http:")) { ArchiveEntry ae = new ArchiveEntry( elementUrl, elementLength, elementDate, element, // ArchiveRecord extends InputStream this, fetchUrl); ae.setHeaderFields(makeCIProperties(elementHeader)); long bytesStored = elementLength; logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored); try { helper.process(ae); } 
catch (PluginException ex) { throw new CacheException.ExploderException("helper.process() threw", ex); } if (ae.getBaseUrl() != null) { if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) { storeEntry(ae); handleAddText(ae); goodEntries++; // this needs to use the correct depth ? how CrawlUrlData cud = new CrawlUrlData(elementUrl, 0); crawlFacade.addToParseQueue(cud); crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored); } } else { badEntries++; logger.debug2("Can't map " + elementUrl + " from " + archiveUrl); } } } } catch (IOException ex) { throw new CacheException.ExploderException(ex); } finally { if (arcReader != null) { try { arcReader.close(); } catch (IOException ex) { throw new CacheException.ExploderException(ex); } } IOUtil.safeClose(arcStream); } // report failed fetches if (badEntries != 0) { String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries"; throw new CacheException.UnretryableException(msg); } }
/**
 * Produces the short report line for this object by delegating to {@code
 * ArchiveUtils.shortReportLine}.
 *
 * @return the line returned by the delegate
 */
public String shortReportLine() {
  final String line = ArchiveUtils.shortReportLine(this);
  return line;
}
/**
 * Main loop of this thread: repeatedly takes a CrawlURI from the controller's frontier, runs it
 * through the fetch chain and the disposition chain, and reports it back to the frontier as
 * finished.
 *
 * <p>The loop ends when {@code shouldRetire} is set, when an InterruptedException reaches the
 * outer handler, or when any other exception or an OutOfMemoryError escapes an iteration. After
 * the loop the recorder is closed and the references to it and to the controller are dropped.
 *
 * @see java.lang.Thread#run()
 */
public void run() {
  // Captured up front because 'controller' is nulled at the end of this method.
  String name = controller.getMetadata().getJobName();
  logger.fine(getName() + " started for order '" + name + "'");
  Recorder.setHttpRecorder(httpRecorder);

  try {
    while (true) {
      ArchiveUtils.continueCheck();

      setStep(Step.ABOUT_TO_GET_URI, null);

      CrawlURI curi = controller.getFrontier().next();

      // Publish the new current URI and its bookkeeping under this thread's monitor.
      synchronized (this) {
        ArchiveUtils.continueCheck();
        setCurrentCuri(curi);
        currentCuri.setThreadNumber(this.serialNumber);
        lastStartTime = System.currentTimeMillis();
        currentCuri.setRecorder(httpRecorder);
      }

      try {
        KeyedProperties.loadOverridesFrom(curi);

        controller.getFetchChain().process(curi, this);

        controller.getFrontier().beginDisposition(curi);

        controller.getDispositionChain().process(curi, this);

      } catch (RuntimeExceptionWrapper e) {
        // Workaround to get cause from BDB
        // NOTE(review): as written this passes e.getCause() -- which is null inside this
        // branch -- back into initCause(), so it cannot attach a cause; confirm the intended
        // source of the cause against the BDB API.
        if (e.getCause() == null) {
          e.initCause(e.getCause());
        }
        recoverableProblem(e);
      } catch (AssertionError ae) {
        // This risks leaving crawl in fatally inconsistent state,
        // but is often reasonable for per-Processor assertion problems
        recoverableProblem(ae);
      } catch (RuntimeException e) {
        recoverableProblem(e);
      } catch (InterruptedException e) {
        // With a URI in hand the interrupt is treated as a recoverable problem so the URI is
        // still returned to the frontier below; otherwise it ends the loop via the outer catch.
        if (currentCuri != null) {
          recoverableProblem(e);
          Thread.interrupted(); // clear interrupt status
        } else {
          throw e;
        }
      } catch (StackOverflowError err) {
        recoverableProblem(err);
      } catch (Error err) {
        // OutOfMemory and any others
        seriousError(err);
      } finally {
        // Always undo the overrides loaded at the top of the inner try.
        KeyedProperties.clearOverridesFrom(curi);
      }

      setStep(Step.ABOUT_TO_RETURN_URI, null);
      ArchiveUtils.continueCheck();

      // Hand the URI back and clear the current-URI reference under this thread's monitor.
      synchronized (this) {
        controller.getFrontier().finished(currentCuri);
        controller.getFrontier().endDisposition();
        setCurrentCuri(null);
      }

      setStep(Step.FINISHING_PROCESS, null);
      lastFinishTime = System.currentTimeMillis();
      if (shouldRetire) {
        break; // from while(true)
      }
    }
  } catch (InterruptedException e) {
    if (currentCuri != null) {
      logger.log(
          Level.SEVERE,
          "Interrupt leaving unfinished CrawlURI " + getName() + " - job may hang",
          e);
    }
    // thread interrupted, ok to end
    logger.log(Level.FINE, this.getName() + " ended with Interruption");
  } catch (Exception e) {
    // everything else; InterruptedException is already handled by the catch above
    logger.log(Level.SEVERE, "Fatal exception in " + getName(), e);
  } catch (OutOfMemoryError err) {
    seriousError(err);
  } finally {
    // Runs on every exit path from the loop, including the normal 'break'.
    controller.getFrontier().endDisposition();
  }

  setCurrentCuri(null);
  // Do cleanup so that objects can be GC.
  this.httpRecorder.closeRecorders();
  this.httpRecorder = null;

  logger.fine(getName() + " finished for order '" + name + "'");
  setStep(Step.FINISHED, null);
  controller = null;
}
/** * One independent queue of items with the same 'classKey' (eg host). * * @author gojomo */ public class BdbWorkQueue extends WorkQueue implements Comparable, Serializable { private static Logger LOGGER = Logger.getLogger(BdbWorkQueue.class.getName()); // be robust against trivial implementation changes private static final long serialVersionUID = ArchiveUtils.classnameBasedUID(BdbWorkQueue.class, 1); /** All items in this queue have this same 'origin' prefix to their keys. */ private byte[] origin; /** * Create a virtual queue inside the given BdbMultipleWorkQueues * * @param classKey */ public BdbWorkQueue(String classKey, BdbFrontier frontier) { super(classKey); this.origin = BdbMultipleWorkQueues.calculateOriginKey(classKey); if (LOGGER.isLoggable(Level.FINE)) { LOGGER.fine(getPrefixClassKey(this.origin) + " " + classKey); } // add the queue-front 'cap' entry; see... // http://sourceforge.net/tracker/index.php?func=detail&aid=1262665&group_id=73833&atid=539102 frontier.getWorkQueues().addCap(origin); } protected long deleteMatchingFromQueue(final WorkQueueFrontier frontier, final String match) throws IOException { try { final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier).getWorkQueues(); return queues.deleteMatchingFromQueue(match, classKey, new DatabaseEntry(origin)); } catch (DatabaseException e) { throw IoUtils.wrapAsIOException(e); } } protected void deleteItem(final WorkQueueFrontier frontier, final CrawlURI peekItem) throws IOException { try { final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier).getWorkQueues(); queues.delete(peekItem); } catch (DatabaseException e) { e.printStackTrace(); throw IoUtils.wrapAsIOException(e); } } protected CrawlURI peekItem(final WorkQueueFrontier frontier) throws IOException { final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier).getWorkQueues(); DatabaseEntry key = new DatabaseEntry(origin); CrawlURI curi = null; int tries = 1; while (true) { try { curi = queues.get(key); } catch 
(DatabaseException e) { LOGGER.log(Level.SEVERE, "peekItem failure; retrying", e); } // ensure CrawlURI, if any, came from acceptable range: if (!ArchiveUtils.startsWith(key.getData(), origin)) { LOGGER.severe( "inconsistency: " + classKey + "(" + getPrefixClassKey(origin) + ") with " + getCount() + " items gave " + curi + "(" + getPrefixClassKey(key.getData())); // clear curi to allow retry curi = null; // reset key to original origin for retry key.setData(origin); } if (curi != null) { // success break; } if (tries > 3) { LOGGER.severe("no item where expected in queue " + classKey); break; } tries++; LOGGER.severe( "Trying get #" + Integer.toString(tries) + " in queue " + classKey + " with " + getCount() + " items using key " + getPrefixClassKey(key.getData())); } return curi; } protected void insertItem( final WorkQueueFrontier frontier, final CrawlURI curi, boolean overwriteIfPresent) throws IOException { try { final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier).getWorkQueues(); queues.put(curi, overwriteIfPresent); if (LOGGER.isLoggable(Level.FINE)) { LOGGER.fine( "Inserted into " + getPrefixClassKey(this.origin) + " (count " + Long.toString(getCount()) + "): " + curi.toString()); } } catch (DatabaseException e) { throw IoUtils.wrapAsIOException(e); } } /** * @param byteArray Byte array to get hex string of. * @return Hex string of passed in byte array (Used logging key-prefixes). */ protected static String getPrefixClassKey(final byte[] byteArray) { int zeroIndex = 0; while (byteArray[zeroIndex] != 0) { zeroIndex++; } try { return new String(byteArray, 0, zeroIndex, "UTF-8"); } catch (UnsupportedEncodingException e) { // should be impossible; UTF-8 always available e.printStackTrace(); return e.getMessage(); } } }
/**
 * Fills a history map from an old persist log fetched from a URL. When no map is supplied, the
 * entries that would have been stored are only logged.
 *
 * <p>The URL is opened via {@code ArchiveUtils.getBufferedReader} and the resulting reader is
 * handed to {@code populatePersistEnvFromLog}.
 *
 * @param sourceUrl url of source persist log
 * @param historyMap map to populate (or null for a dry run)
 * @return number of records
 * @throws DatabaseException propagated from the delegate that does the populating
 * @throws IOException propagated from opening the log
 */
public static int copyPersistSourceToHistoryMap(
    URL sourceUrl, StoredSortedMap<String, Map> historyMap)
    throws DatabaseException, IOException {
  return populatePersistEnvFromLog(ArchiveUtils.getBufferedReader(sourceUrl), historyMap);
}
@Override public void map( Text key, WritableArchiveRecord value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { ArchiveRecord record = value.getRecord(); ArchiveRecordHeader header = record.getHeader(); // Logging for debug info: log.debug( "Processing @" + header.getOffset() + "+" + record.available() + "," + header.getLength() + ": " + header.getUrl()); for (String h : header.getHeaderFields().keySet()) { log.debug("ArchiveHeader: " + h + " -> " + header.getHeaderValue(h)); } try { MDX mdx = new MDX(); Date crawl_date = ArchiveUtils.parse14DigitISODate(header.getDate(), null); if (crawl_date != null) { mdx.setTs(ArchiveUtils.get14DigitDate(crawl_date)); } else { mdx.setTs(header.getDate()); } mdx.setUrl(header.getUrl()); mdx.setHash(header.getDigest()); // Data from WARC record: mdx.put("source-file", key.toString()); mdx.put("content-type", header.getMimetype()); mdx.put("content-length", "" + header.getContentLength()); mdx.put("length", "" + header.getLength()); mdx.put("source-offset", "" + header.getOffset()); mdx.put("record-identifier", header.getRecordIdentifier()); for (String k : header.getHeaderFieldKeys()) { mdx.put("HEADER-" + k, "" + header.getHeaderValue(k)); } // check record type and look for HTTP data: Header[] httpHeaders = null; if (record instanceof WARCRecord) { mdx.setRecordType("warc." + header.getHeaderValue(HEADER_KEY_TYPE)); mdx.setHash("" + header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST)); // There are not always headers! The code should check first. 
String statusLine = HttpParser.readLine(record, "UTF-8"); if (statusLine != null && statusLine.startsWith("HTTP")) { String firstLine[] = statusLine.split(" "); if (firstLine.length > 1) { String statusCode = firstLine[1].trim(); mdx.put("status-code", statusCode); try { httpHeaders = HttpParser.parseHeaders(record, "UTF-8"); } catch (ProtocolException p) { log.error( "ProtocolException [" + statusCode + "]: " + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@" + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY), p); } } else { log.warn("Could not parse status line: " + statusLine); } } else { log.warn( "Invalid status line: " + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@" + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY)); } } else if (record instanceof ARCRecord) { mdx.setRecordType("arc"); ARCRecord arcr = (ARCRecord) record; mdx.put("status-code", "" + arcr.getStatusCode()); httpHeaders = arcr.getHttpHeaders(); } else { mdx.setRecordType("unknown"); } // Add in http headers if (httpHeaders != null) { for (Header h : httpHeaders) { mdx.put("HTTP-" + h.getName(), h.getValue()); } } // URL: String uri = header.getUrl(); if (uri != null) { UsableURI uuri = UsableURIFactory.getInstance(uri); // Hosts: if ("https".contains(uuri.getScheme())) { mdx.put("host", uuri.getAuthority()); } } else { mdx.put("errors", "malformed-url"); } // Year String date = header.getDate(); if (date != null && date.length() > 4) { mdx.put("year", date.substring(0, 4)); } else { mdx.put("errors", "malformed-date"); } // And collect: String outKey = mdx.getHash(); if (outKey == null || outKey == "" || "null".equals(outKey)) { outKey = mdx.getRecordType() + ":" + header.getMimetype(); } else { outKey = mdx.getRecordType() + ":" + outKey; } output.collect(new Text(outKey), new Text(mdx.toString())); } catch (JSONException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
public void _jspService(HttpServletRequest request, HttpServletResponse response) throws java.io.IOException, ServletException { JspFactory _jspxFactory = null; javax.servlet.jsp.PageContext pageContext = null; HttpSession session = null; ServletContext application = null; ServletConfig config = null; JspWriter out = null; Object page = this; JspWriter _jspx_out = null; try { _jspxFactory = JspFactory.getDefaultFactory(); response.setContentType("text/html; charset=UTF-8"); pageContext = _jspxFactory.getPageContext(this, request, response, "/error.jsp", true, 8192, true); application = pageContext.getServletContext(); config = pageContext.getServletConfig(); session = pageContext.getSession(); out = pageContext.getOut(); _jspx_out = out; out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); /** This include page ensures that the handler exists and is ready to be accessed. */ CrawlJobHandler handler = (CrawlJobHandler) application.getAttribute("handler"); Heritrix heritrix = (Heritrix) application.getAttribute("heritrix"); // If handler is empty then this is the first time this bit of code is // being run since the server came online. In that case get or create the // handler. if (handler == null) { if (Heritrix.isSingleInstance()) { heritrix = Heritrix.getSingleInstance(); handler = heritrix.getJobHandler(); application.setAttribute("heritrix", heritrix); application.setAttribute("handler", handler); } else { // TODO: // If we get here, then there are multiple heritrix instances // and we have to put up a screen allowing the user choose between. // Otherwise, there is no Heritrix instance. Thats a problem. 
throw new RuntimeException( "No heritrix instance (or multiple " + "to choose from and we haven't implemented this yet)"); } } // ensure controller's settingsHandler is always thread-installed // in web ui threads if (handler != null) { CrawlJob job = handler.getCurrentJob(); if (job != null) { CrawlController controller = job.getController(); if (controller != null) { controller.installThreadContextSettingsHandler(); } } } out.write("\n"); out.write("\n\n"); String title = "Help"; int tab = 6; out.write("\n\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); String currentHeritrixName = (heritrix == null) ? "No current Heritrix instance" : (heritrix.getMBeanName() == null) ? heritrix.getInstances().keySet().iterator().next().toString() : heritrix.getMBeanName().toString(); /** * An include file that handles the "look" and navigation of a web page. Include at top (where * you would normally begin the HTML code). If used, the include "foot.jsp" should be included * at the end of the HTML code. It will close any table, body and html tags left open in this * one. Any custom HTML code is thus placed between the two. * * <p>The following variables must exist prior to this file being included: * * <p>String title - Title of the web page int tab - Which to display as 'selected'. 0 - * Console 1 - Jobs 2 - Profiles 3 - Logs 4 - Reports 5 - Settings 6 - Help * * <p>SimpleHandler handler - In general this is provided by the include page 'handler.jsp' * which should be included prior to this one. 
* * @author Kristinn Sigurdsson */ String shortJobStatus = null; if (handler.getCurrentJob() != null) { shortJobStatus = TextUtils.getFirstWord(handler.getCurrentJob().getStatus()); } String favicon = System.getProperties().getProperty("heritrix.favicon", "h.ico"); out.write("\n"); StatisticsTracker stats = null; if (handler.getCurrentJob() != null) { // Assume that StatisticsTracker is being used. stats = (StatisticsTracker) handler.getCurrentJob().getStatisticsTracking(); } out.write("\n"); out.write("\n\n"); out.write("<html>\n "); out.write("<head>\n \t"); out.write( "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>\n "); out.write("<title>Heritrix: "); out.print(title); out.write("</title>\n "); out.write("<link rel=\"stylesheet\" \n href=\""); out.print(request.getContextPath()); out.write("/css/heritrix.css\">\n "); out.write("<link rel=\"icon\" href=\""); out.print(request.getContextPath()); out.write("/images/"); out.print(favicon); out.write("\" type=\"image/x-icon\" />\n "); out.write("<link rel=\"shortcut icon\" href=\""); out.print(request.getContextPath()); out.write("/images/"); out.print(favicon); out.write("\" type=\"image/x-icon\" />\n "); out.write("<script src=\"/js/util.js\">\n "); out.write("</script>\n "); out.write("</head>\n\n "); out.write("<body>\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" width=\"100%\">\n "); out.write("<tr>\n "); out.write("<td>\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" height=\"100%\">\n "); out.write("<tr>\n "); out.write( "<td height=\"60\" width=\"155\" valign=\"top\" nowrap>\n "); out.write( "<table border=\"0\" width=\"155\" cellspacing=\"0\" cellpadding=\"0\" height=\"60\">\n "); out.write("<tr>\n "); out.write( "<td align=\"center\" height=\"40\" valign=\"bottom\">\n "); out.write("<a border=\"0\" \n href=\""); out.print(request.getContextPath()); out.write("/index.jsp\">"); out.write("<img border=\"0\" src=\""); 
out.print(request.getContextPath()); out.write("/images/logo.gif\" height=\"37\" width=\"145\">"); out.write("</a>\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td class=\"subheading\">\n "); out.print(title); out.write("\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("</td>\n "); out.write( "<td width=\"5\" nowrap>\n \n "); out.write("</td>\n "); out.write("<td width=\"460\" align=\"left\" nowrap>\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" height=\"60\">\n "); out.write("<tr>\n "); out.write("<td colspan=\"2\" nowrap>\n "); SimpleDateFormat sdf = new SimpleDateFormat("MMM. d, yyyy HH:mm:ss"); sdf.setTimeZone(java.util.TimeZone.getTimeZone("GMT")); out.write("\n "); out.write("<b>\n Status as of "); out.write("<a style=\"color: #000000\" href=\""); out.print(request.getRequestURL()); out.write("\">"); out.print(sdf.format(new java.util.Date())); out.write(" GMT"); out.write("</a>\n "); out.write( "</b>\n \n "); out.write("<span style=\"text-align:right\">\n "); out.write( "<b>\n Alerts: \n "); out.write("</b>\n "); if (heritrix.getAlertsCount() == 0) { out.write("\n "); out.write("<a style=\"color: #000000; text-decoration: none\" href=\""); out.print(request.getContextPath()); out.write("/console/alerts.jsp\">no alerts"); out.write("</a>\n "); } else if (heritrix.getNewAlertsCount() > 0) { out.write("\n "); out.write("<b>"); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/console/alerts.jsp\">"); out.print(heritrix.getAlerts().size()); out.write(" ("); out.print(heritrix.getNewAlertsCount()); out.write(" new)"); out.write("</a>"); out.write("</b>\n "); } else { out.write("\n "); out.write("<a style=\"color: #000000\" href=\""); out.print(request.getContextPath()); out.write("/console/alerts.jsp\">"); out.print(heritrix.getAlertsCount()); out.write(" ("); out.print(heritrix.getNewAlertsCount()); out.write(" new)"); 
out.write("</a>\n "); } out.write("\n "); out.write("</span>\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td valign=\"top\" nowrap>\n\t\t\t\t\t\t\t\t\t\t"); out.print( handler.isRunning() ? "<span class='status'>Crawling Jobs</span>" : "<span class='status'>Holding Jobs</span>"); out.write("<i> "); out.write("</i>\n\t\t\t\t\t\t\t\t\t\t"); out.write("</td>\n\t\t\t\t\t\t\t\t\t\t"); out.write("<td valign=\"top\" align=\"right\" nowrap>\n\t\t\t\t\t\t\t\t\t\t"); if (handler.isRunning() || handler.isCrawling()) { if (handler.getCurrentJob() != null) { out.write("\n\t\t\t\t\t\t\t\t\t\t"); out.write("<span class='status'>\n\t\t\t\t\t\t\t\t\t\t"); out.print(shortJobStatus); out.write("</span> job:\n\t\t\t\t\t\t\t\t\t\t"); out.write("<i>"); out.print(handler.getCurrentJob().getJobName()); out.write("</i>\n\t\t\t\t\t\t\t\t\t\t"); } else { out.println("No job ready <a href=\""); out.println(request.getContextPath()); out.println("/jobs.jsp\" style='color: #000000'>(create new)</a>"); } } out.write("\n\t\t\t\t\t\t\t\t\t\t"); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td nowrap>\n "); out.print(handler.getPendingJobs().size()); out.write( "\n jobs\n "); out.write("<a style=\"color: #000000\" href=\""); out.print(request.getContextPath()); out.write("/jobs.jsp#pending\">pending"); out.write("</a>,\n "); out.print(handler.getCompletedJobs().size()); out.write("\n "); out.write("<a style=\"color: #000000\" href=\""); out.print(request.getContextPath()); out.write("/jobs.jsp#completed\">completed"); out.write( "</a>\n \n "); out.write("</td>\n "); out.write("<td nowrap align=\"right\">\n "); if (handler.isCrawling()) { out.write("\n "); out.print((stats != null) ? stats.successfullyFetchedCount() : 0); out.write(" URIs in \n\t\t "); out.print( ArchiveUtils.formatMillisecondsToConventional( ((stats != null) ? 
(stats.getCrawlerTotalElapsedTime()) : 0), false)); out.write("\n\t\t ("); out.print( ArchiveUtils.doubleToString( ((stats != null) ? stats.currentProcessedDocsPerSec() : 0), 2)); out.write("/sec)\n "); } out.write("\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("</td>\n "); out.write("<td width=\"100%\" nowrap>\n \n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td bgcolor=\"#0000FF\" height=\"1\" colspan=\"4\">\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td colspan=\"4\" height=\"20\">\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" width=\"100%\" height=\"20\">\n "); out.write("<tr>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 0 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/index.jsp\" class=\"tab_text"); out.print(tab == 0 ? "_selected" : ""); out.write("\">Console"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 1 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/jobs.jsp\" class=\"tab_text"); out.print(tab == 1 ? "_selected" : ""); out.write("\">Jobs"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 2 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/profiles.jsp\" class=\"tab_text"); out.print(tab == 2 ? 
"_selected" : ""); out.write("\">Profiles"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 3 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/logs.jsp\" class=\"tab_text"); out.print(tab == 3 ? "_selected" : ""); out.write("\">Logs"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 4 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/reports.jsp\" class=\"tab_text"); out.print(tab == 4 ? "_selected" : ""); out.write("\">Reports"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 5 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/setup.jsp\" class=\"tab_text"); out.print(tab == 5 ? "_selected" : ""); out.write("\">Setup"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 6 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/help.jsp\" class=\"tab_text"); out.print(tab == 6 ? 
"_selected" : ""); out.write("\">Help"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td width=\"100%\">\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td bgcolor=\"#0000FF\" height=\"1\" colspan=\"4\">"); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("<!-- MAIN BODY -->\n"); out.write("\n\n"); out.write("<div class=\"margined\">\n "); out.write("<h1>Heritrix online help"); out.write("</h1>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/about.jsp\">About Heritrix"); out.write("</a>"); out.write("</b>"); out.write("</br>\n Includes license and current environment information.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a target=\"_blank\" \n href=\""); out.print(request.getContextPath()); out.write("/docs/articles/user_manual/index.html\">User\n Manual"); out.write("</a>"); out.write("</b>"); out.write( "<br> Covers creating, configuring, launching,\n monitoring and analysing crawl jobs. For all users.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a target=\"_blank\" \n href=\""); out.print(request.getContextPath()); out.write("/docs/articles/developer_manual/index.html\">Developer Manual"); out.write("</a>"); out.write("</b>"); out.write( "<br> Covers how to write add on modules for Heritrix\n and provides in depth coverage of Heritrix's architecture. 
For\n advanced users.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a target=\"_blank\" \n href=\""); out.print(request.getContextPath()); out.write("/docs/articles/releasenotes/index.html\">Release Notes"); out.write("</a>"); out.write("</b>"); out.write("<br>\n"); out.write("</p>\n"); out.write("<p>\n\t"); out.write("<b>"); out.write( "<a href=\"http://crawler.archive.org/issue-tracking.html\" target=\"_blank\">Issue Tracking"); out.write("</a>"); out.write("</b>"); out.write( "<br />\n\tIf you have found a bug or would like to see new features in Heritrix, check the following links:\n\t"); out.write("<ul>\n\t\t"); out.write("<li>"); out.write( "<a href=\"http://sourceforge.net/tracker/?atid=539099&group_id=73833&func=browse\" target=\"_blank\">Bugs"); out.write("</a>"); out.write("</li>\n\t\t"); out.write("<li>"); out.write( "<a href=\"http://sourceforge.net/tracker/?atid=539102&group_id=73833&func=browse\" target=\"_blank\">Feature Requests"); out.write("</a>"); out.write("</li>\n\t"); out.write("</ul>\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write( "<a href=\"http://crawler.archive.org/mail-lists.html\" target=\"_blank\">Mailing Lists"); out.write("</a>"); out.write("</b>"); out.write("<br />\n For general discussion on Heritrix, use our "); out.write( "<a href=\"http://groups.yahoo.com/group/archive-crawler/\" target=\"_blank\">Crawler Discussion List"); out.write("</a>.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/help/regexpr.jsp\">Regular Expressions"); out.write("</a>"); out.write("</b>"); out.write( "<br />\n Information about the regular expressions used in Heritrix and a tool to double check that your regular expressions are valid and that they correctly identify the desired strings.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a href=\""); 
out.print(request.getContextPath()); out.write("/help/codes.jsp\">URI Fetch Status Codes"); out.write("</a>"); out.write("</b>"); out.write( "<br />\n This reference details what each of the fetch status codes assigned to URIs means.\n"); out.write("</p>\n"); out.write("<hr />\n"); out.write("<font size=\"-1\">Heritrix version @VERSION@"); out.write("</font>\n"); out.write("</div>\n"); /** * An include file that handles the "look" and navigation of a web page. Wraps up things * begun in the "head.jsp" include file. See it for more details. * * @author Kristinn Sigurdsson */ out.write("\n"); out.write("<br/>\n"); out.write("<br/>\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" width=\"100%\">\n "); out.write("<tr>\n "); out.write("<td bgcolor=\"#0000FF\" height=\"1\" colspan=\"4\">"); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td class=\"instance_name\">Identifier: "); out.print(currentHeritrixName); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("<!-- END MAIN BODY -->\n "); out.write("</body>\n"); out.write("</html>"); out.write("\n"); } catch (Throwable t) { out = _jspx_out; if (out != null && out.getBufferSize() != 0) out.clearBuffer(); if (pageContext != null) pageContext.handlePageException(t); } finally { if (_jspxFactory != null) _jspxFactory.releasePageContext(pageContext); } }