protected void startHeritrix(String path) throws Exception { String authPassword = (new BigInteger(SecureRandom.getSeed(16))).abs().toString(16); String[] args = {"-j", path + "/jobs", "-a", authPassword}; // TODO: add auth password? heritrix = new Heritrix(); heritrix.instanceMain(args); configureHeritrix(); heritrix.getEngine().requestLaunch("selftest-job"); }
/**
 * Looks up the job handler of the single running Heritrix instance.
 *
 * @return the {@code CrawlJobHandler} obtained from {@code Heritrix.getSingleInstance()}
 * @throws Exception if {@code Heritrix.isSingleInstance()} reports {@code false}, i.e. there is
 *     not exactly one Heritrix instance to take the handler from
 */
public CrawlJobHandler initHandler() throws Exception {
  // Guard first: without a unique instance there is nothing to return.
  if (!Heritrix.isSingleInstance()) {
    throw new Exception("No heritrix instance");
  }
  return Heritrix.getSingleInstance().getJobHandler();
} // - initHandler
/**
 * Delegates to the engine's {@code waitForNoRunningJobs} with an argument of {@code 0}.
 *
 * <p>NOTE(review): presumably this blocks until no job is running and {@code 0} means "no
 * timeout" -- confirm against the engine's documentation.
 *
 * @throws Exception if the engine call fails
 */
protected void waitForCrawlFinish() throws Exception {
  heritrix
      .getEngine()
      .waitForNoRunningJobs(0);
}
/**
 * Shuts the engine down and then stops the Heritrix component, in that order.
 *
 * @throws Exception if either step fails; the component is not stopped when the engine
 *     shutdown throws
 */
protected void stopHeritrix() throws Exception {
  // Step 1: engine first.
  heritrix
      .getEngine()
      .shutdown();

  // Step 2: then the component obtained from getComponent().
  heritrix
      .getComponent()
      .stop();
}
public void _jspService(HttpServletRequest request, HttpServletResponse response) throws java.io.IOException, ServletException { JspFactory _jspxFactory = null; javax.servlet.jsp.PageContext pageContext = null; HttpSession session = null; ServletContext application = null; ServletConfig config = null; JspWriter out = null; Object page = this; JspWriter _jspx_out = null; try { _jspxFactory = JspFactory.getDefaultFactory(); response.setContentType("text/html; charset=UTF-8"); pageContext = _jspxFactory.getPageContext(this, request, response, "/error.jsp", true, 8192, true); application = pageContext.getServletContext(); config = pageContext.getServletConfig(); session = pageContext.getSession(); out = pageContext.getOut(); _jspx_out = out; out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); /** This include page ensures that the handler exists and is ready to be accessed. */ CrawlJobHandler handler = (CrawlJobHandler) application.getAttribute("handler"); Heritrix heritrix = (Heritrix) application.getAttribute("heritrix"); // If handler is empty then this is the first time this bit of code is // being run since the server came online. In that case get or create the // handler. if (handler == null) { if (Heritrix.isSingleInstance()) { heritrix = Heritrix.getSingleInstance(); handler = heritrix.getJobHandler(); application.setAttribute("heritrix", heritrix); application.setAttribute("handler", handler); } else { // TODO: // If we get here, then there are multiple heritrix instances // and we have to put up a screen allowing the user choose between. // Otherwise, there is no Heritrix instance. Thats a problem. 
throw new RuntimeException( "No heritrix instance (or multiple " + "to choose from and we haven't implemented this yet)"); } } // ensure controller's settingsHandler is always thread-installed // in web ui threads if (handler != null) { CrawlJob job = handler.getCurrentJob(); if (job != null) { CrawlController controller = job.getController(); if (controller != null) { controller.installThreadContextSettingsHandler(); } } } out.write("\n"); out.write("\n\n"); String title = "Help"; int tab = 6; out.write("\n\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); String currentHeritrixName = (heritrix == null) ? "No current Heritrix instance" : (heritrix.getMBeanName() == null) ? heritrix.getInstances().keySet().iterator().next().toString() : heritrix.getMBeanName().toString(); /** * An include file that handles the "look" and navigation of a web page. Include at top (where * you would normally begin the HTML code). If used, the include "foot.jsp" should be included * at the end of the HTML code. It will close any table, body and html tags left open in this * one. Any custom HTML code is thus placed between the two. * * <p>The following variables must exist prior to this file being included: * * <p>String title - Title of the web page int tab - Which to display as 'selected'. 0 - * Console 1 - Jobs 2 - Profiles 3 - Logs 4 - Reports 5 - Settings 6 - Help * * <p>SimpleHandler handler - In general this is provided by the include page 'handler.jsp' * which should be included prior to this one. 
* * @author Kristinn Sigurdsson */ String shortJobStatus = null; if (handler.getCurrentJob() != null) { shortJobStatus = TextUtils.getFirstWord(handler.getCurrentJob().getStatus()); } String favicon = System.getProperties().getProperty("heritrix.favicon", "h.ico"); out.write("\n"); StatisticsTracker stats = null; if (handler.getCurrentJob() != null) { // Assume that StatisticsTracker is being used. stats = (StatisticsTracker) handler.getCurrentJob().getStatisticsTracking(); } out.write("\n"); out.write("\n\n"); out.write("<html>\n "); out.write("<head>\n \t"); out.write( "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>\n "); out.write("<title>Heritrix: "); out.print(title); out.write("</title>\n "); out.write("<link rel=\"stylesheet\" \n href=\""); out.print(request.getContextPath()); out.write("/css/heritrix.css\">\n "); out.write("<link rel=\"icon\" href=\""); out.print(request.getContextPath()); out.write("/images/"); out.print(favicon); out.write("\" type=\"image/x-icon\" />\n "); out.write("<link rel=\"shortcut icon\" href=\""); out.print(request.getContextPath()); out.write("/images/"); out.print(favicon); out.write("\" type=\"image/x-icon\" />\n "); out.write("<script src=\"/js/util.js\">\n "); out.write("</script>\n "); out.write("</head>\n\n "); out.write("<body>\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" width=\"100%\">\n "); out.write("<tr>\n "); out.write("<td>\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" height=\"100%\">\n "); out.write("<tr>\n "); out.write( "<td height=\"60\" width=\"155\" valign=\"top\" nowrap>\n "); out.write( "<table border=\"0\" width=\"155\" cellspacing=\"0\" cellpadding=\"0\" height=\"60\">\n "); out.write("<tr>\n "); out.write( "<td align=\"center\" height=\"40\" valign=\"bottom\">\n "); out.write("<a border=\"0\" \n href=\""); out.print(request.getContextPath()); out.write("/index.jsp\">"); out.write("<img border=\"0\" src=\""); 
out.print(request.getContextPath()); out.write("/images/logo.gif\" height=\"37\" width=\"145\">"); out.write("</a>\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td class=\"subheading\">\n "); out.print(title); out.write("\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("</td>\n "); out.write( "<td width=\"5\" nowrap>\n \n "); out.write("</td>\n "); out.write("<td width=\"460\" align=\"left\" nowrap>\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" height=\"60\">\n "); out.write("<tr>\n "); out.write("<td colspan=\"2\" nowrap>\n "); SimpleDateFormat sdf = new SimpleDateFormat("MMM. d, yyyy HH:mm:ss"); sdf.setTimeZone(java.util.TimeZone.getTimeZone("GMT")); out.write("\n "); out.write("<b>\n Status as of "); out.write("<a style=\"color: #000000\" href=\""); out.print(request.getRequestURL()); out.write("\">"); out.print(sdf.format(new java.util.Date())); out.write(" GMT"); out.write("</a>\n "); out.write( "</b>\n \n "); out.write("<span style=\"text-align:right\">\n "); out.write( "<b>\n Alerts: \n "); out.write("</b>\n "); if (heritrix.getAlertsCount() == 0) { out.write("\n "); out.write("<a style=\"color: #000000; text-decoration: none\" href=\""); out.print(request.getContextPath()); out.write("/console/alerts.jsp\">no alerts"); out.write("</a>\n "); } else if (heritrix.getNewAlertsCount() > 0) { out.write("\n "); out.write("<b>"); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/console/alerts.jsp\">"); out.print(heritrix.getAlerts().size()); out.write(" ("); out.print(heritrix.getNewAlertsCount()); out.write(" new)"); out.write("</a>"); out.write("</b>\n "); } else { out.write("\n "); out.write("<a style=\"color: #000000\" href=\""); out.print(request.getContextPath()); out.write("/console/alerts.jsp\">"); out.print(heritrix.getAlertsCount()); out.write(" ("); out.print(heritrix.getNewAlertsCount()); out.write(" new)"); 
out.write("</a>\n "); } out.write("\n "); out.write("</span>\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td valign=\"top\" nowrap>\n\t\t\t\t\t\t\t\t\t\t"); out.print( handler.isRunning() ? "<span class='status'>Crawling Jobs</span>" : "<span class='status'>Holding Jobs</span>"); out.write("<i> "); out.write("</i>\n\t\t\t\t\t\t\t\t\t\t"); out.write("</td>\n\t\t\t\t\t\t\t\t\t\t"); out.write("<td valign=\"top\" align=\"right\" nowrap>\n\t\t\t\t\t\t\t\t\t\t"); if (handler.isRunning() || handler.isCrawling()) { if (handler.getCurrentJob() != null) { out.write("\n\t\t\t\t\t\t\t\t\t\t"); out.write("<span class='status'>\n\t\t\t\t\t\t\t\t\t\t"); out.print(shortJobStatus); out.write("</span> job:\n\t\t\t\t\t\t\t\t\t\t"); out.write("<i>"); out.print(handler.getCurrentJob().getJobName()); out.write("</i>\n\t\t\t\t\t\t\t\t\t\t"); } else { out.println("No job ready <a href=\""); out.println(request.getContextPath()); out.println("/jobs.jsp\" style='color: #000000'>(create new)</a>"); } } out.write("\n\t\t\t\t\t\t\t\t\t\t"); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td nowrap>\n "); out.print(handler.getPendingJobs().size()); out.write( "\n jobs\n "); out.write("<a style=\"color: #000000\" href=\""); out.print(request.getContextPath()); out.write("/jobs.jsp#pending\">pending"); out.write("</a>,\n "); out.print(handler.getCompletedJobs().size()); out.write("\n "); out.write("<a style=\"color: #000000\" href=\""); out.print(request.getContextPath()); out.write("/jobs.jsp#completed\">completed"); out.write( "</a>\n \n "); out.write("</td>\n "); out.write("<td nowrap align=\"right\">\n "); if (handler.isCrawling()) { out.write("\n "); out.print((stats != null) ? stats.successfullyFetchedCount() : 0); out.write(" URIs in \n\t\t "); out.print( ArchiveUtils.formatMillisecondsToConventional( ((stats != null) ? 
(stats.getCrawlerTotalElapsedTime()) : 0), false)); out.write("\n\t\t ("); out.print( ArchiveUtils.doubleToString( ((stats != null) ? stats.currentProcessedDocsPerSec() : 0), 2)); out.write("/sec)\n "); } out.write("\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("</td>\n "); out.write("<td width=\"100%\" nowrap>\n \n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td bgcolor=\"#0000FF\" height=\"1\" colspan=\"4\">\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td colspan=\"4\" height=\"20\">\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" width=\"100%\" height=\"20\">\n "); out.write("<tr>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 0 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/index.jsp\" class=\"tab_text"); out.print(tab == 0 ? "_selected" : ""); out.write("\">Console"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 1 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/jobs.jsp\" class=\"tab_text"); out.print(tab == 1 ? "_selected" : ""); out.write("\">Jobs"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 2 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/profiles.jsp\" class=\"tab_text"); out.print(tab == 2 ? 
"_selected" : ""); out.write("\">Profiles"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 3 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/logs.jsp\" class=\"tab_text"); out.print(tab == 3 ? "_selected" : ""); out.write("\">Logs"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 4 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/reports.jsp\" class=\"tab_text"); out.print(tab == 4 ? "_selected" : ""); out.write("\">Reports"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 5 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/setup.jsp\" class=\"tab_text"); out.print(tab == 5 ? "_selected" : ""); out.write("\">Setup"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td class=\"tab_seperator\"> "); out.write("</td>\n "); out.write("<td class=\"tab"); out.print(tab == 6 ? "_selected" : ""); out.write("\">\n "); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/help.jsp\" class=\"tab_text"); out.print(tab == 6 ? 
"_selected" : ""); out.write("\">Help"); out.write("</a>\n "); out.write("</td>\n "); out.write("<td width=\"100%\">\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td bgcolor=\"#0000FF\" height=\"1\" colspan=\"4\">"); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("<!-- MAIN BODY -->\n"); out.write("\n\n"); out.write("<div class=\"margined\">\n "); out.write("<h1>Heritrix online help"); out.write("</h1>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/about.jsp\">About Heritrix"); out.write("</a>"); out.write("</b>"); out.write("</br>\n Includes license and current environment information.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a target=\"_blank\" \n href=\""); out.print(request.getContextPath()); out.write("/docs/articles/user_manual/index.html\">User\n Manual"); out.write("</a>"); out.write("</b>"); out.write( "<br> Covers creating, configuring, launching,\n monitoring and analysing crawl jobs. For all users.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a target=\"_blank\" \n href=\""); out.print(request.getContextPath()); out.write("/docs/articles/developer_manual/index.html\">Developer Manual"); out.write("</a>"); out.write("</b>"); out.write( "<br> Covers how to write add on modules for Heritrix\n and provides in depth coverage of Heritrix's architecture. 
For\n advanced users.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a target=\"_blank\" \n href=\""); out.print(request.getContextPath()); out.write("/docs/articles/releasenotes/index.html\">Release Notes"); out.write("</a>"); out.write("</b>"); out.write("<br>\n"); out.write("</p>\n"); out.write("<p>\n\t"); out.write("<b>"); out.write( "<a href=\"http://crawler.archive.org/issue-tracking.html\" target=\"_blank\">Issue Tracking"); out.write("</a>"); out.write("</b>"); out.write( "<br />\n\tIf you have found a bug or would like to see new features in Heritrix, check the following links:\n\t"); out.write("<ul>\n\t\t"); out.write("<li>"); out.write( "<a href=\"http://sourceforge.net/tracker/?atid=539099&group_id=73833&func=browse\" target=\"_blank\">Bugs"); out.write("</a>"); out.write("</li>\n\t\t"); out.write("<li>"); out.write( "<a href=\"http://sourceforge.net/tracker/?atid=539102&group_id=73833&func=browse\" target=\"_blank\">Feature Requests"); out.write("</a>"); out.write("</li>\n\t"); out.write("</ul>\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write( "<a href=\"http://crawler.archive.org/mail-lists.html\" target=\"_blank\">Mailing Lists"); out.write("</a>"); out.write("</b>"); out.write("<br />\n For general discussion on Heritrix, use our "); out.write( "<a href=\"http://groups.yahoo.com/group/archive-crawler/\" target=\"_blank\">Crawler Discussion List"); out.write("</a>.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a href=\""); out.print(request.getContextPath()); out.write("/help/regexpr.jsp\">Regular Expressions"); out.write("</a>"); out.write("</b>"); out.write( "<br />\n Information about the regular expressions used in Heritrix and a tool to double check that your regular expressions are valid and that they correctly identify the desired strings.\n"); out.write("</p>\n"); out.write("<p>\n "); out.write("<b>"); out.write("<a href=\""); 
out.print(request.getContextPath()); out.write("/help/codes.jsp\">URI Fetch Status Codes"); out.write("</a>"); out.write("</b>"); out.write( "<br />\n This reference details what each of the fetch status codes assigned to URIs means.\n"); out.write("</p>\n"); out.write("<hr />\n"); out.write("<font size=\"-1\">Heritrix version @VERSION@"); out.write("</font>\n"); out.write("</div>\n"); /** * An include file that handles the "look" and navigation of a web page. Wrapps up things * begun in the "head.jsp" include file. See it for more details. * * @author Kristinn Sigurdsson */ out.write("\n"); out.write("<br/>\n"); out.write("<br/>\n "); out.write( "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\" width=\"100%\">\n "); out.write("<tr>\n "); out.write("<td bgcolor=\"#0000FF\" height=\"1\" colspan=\"4\">"); out.write("</td>\n "); out.write("</tr>\n "); out.write("<tr>\n "); out.write("<td class=\"instance_name\">Identifier: "); out.print(currentHeritrixName); out.write("</td>\n "); out.write("</tr>\n "); out.write("</table>\n "); out.write("<!-- END MAIN BODY -->\n "); out.write("</body>\n"); out.write("</html>"); out.write("\n"); } catch (Throwable t) { out = _jspx_out; if (out != null && out.getBufferSize() != 0) out.clearBuffer(); if (pageContext != null) pageContext.handlePageException(t); } finally { if (_jspxFactory != null) _jspxFactory.releasePageContext(pageContext); } }
public void _jspService(HttpServletRequest request, HttpServletResponse response) throws java.io.IOException, ServletException { JspFactory _jspxFactory = null; javax.servlet.jsp.PageContext pageContext = null; HttpSession session = null; ServletContext application = null; ServletConfig config = null; JspWriter out = null; Object page = this; JspWriter _jspx_out = null; try { _jspxFactory = JspFactory.getDefaultFactory(); response.setContentType("text/html;charset=ISO-8859-1"); pageContext = _jspxFactory.getPageContext(this, request, response, "/error.jsp", true, 8192, true); application = pageContext.getServletContext(); config = pageContext.getServletConfig(); session = pageContext.getSession(); out = pageContext.getOut(); _jspx_out = out; out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); out.write("\n"); /** This include page ensures that the handler exists and is ready to be accessed. */ CrawlJobHandler handler = (CrawlJobHandler) application.getAttribute("handler"); Heritrix heritrix = (Heritrix) application.getAttribute("heritrix"); // If handler is empty then this is the first time this bit of code is // being run since the server came online. In that case get or create the // handler. if (handler == null) { if (Heritrix.isSingleInstance()) { heritrix = Heritrix.getSingleInstance(); handler = heritrix.getJobHandler(); application.setAttribute("heritrix", heritrix); application.setAttribute("handler", handler); } else { // TODO: // If we get here, then there are multiple heritrix instances // and we have to put up a screen allowing the user choose between. // Otherwise, there is no Heritrix instance. Thats a problem. 
throw new RuntimeException( "No heritrix instance (or multiple " + "to choose from and we haven't implemented this yet)"); } } // ensure controller's settingsHandler is always thread-installed // in web ui threads if (handler != null) { CrawlJob job = handler.getCurrentJob(); if (job != null) { CrawlController controller = job.getController(); if (controller != null) { controller.installThreadContextSettingsHandler(); } } } out.write("\n"); out.write("\n"); /** This webpage performs actions that can be performed from the console. */ String sAction = request.getParameter("action"); if (sAction != null) { // Need to handle an action if (sAction.equalsIgnoreCase("start")) { // Tell handler to start crawl job handler.startCrawler(); } else if (sAction.equalsIgnoreCase("stop")) { // Tell handler to stop crawl job handler.stopCrawler(); } else if (sAction.equalsIgnoreCase("terminate")) { // Delete current job if (handler.getCurrentJob() != null) { handler.deleteJob(handler.getCurrentJob().getUID()); } } else if (sAction.equalsIgnoreCase("pause")) { // Tell handler to pause crawl job handler.pauseJob(); } else if (sAction.equalsIgnoreCase("resume")) { // Tell handler to resume crawl job handler.resumeJob(); } else if (sAction.equalsIgnoreCase("checkpoint")) { if (handler.getCurrentJob() != null) { handler.checkpointJob(); } } } response.sendRedirect(request.getContextPath() + "/index.jsp"); out.write("\n"); } catch (Throwable t) { out = _jspx_out; if (out != null && out.getBufferSize() != 0) out.clearBuffer(); if (pageContext != null) pageContext.handlePageException(t); } finally { if (_jspxFactory != null) _jspxFactory.releasePageContext(pageContext); } }