private String prepareResultPagesSection() {
    String resultsPath;
    try {
        resultsPath = rootDirectory.getCanonicalPath() + Crawler.RESULTS_PATH_LOCAL;
    } catch (IOException e) {
        System.out.println("HTTPRequest: error resolving root directory " + rootDirectory.toString());
        return "";
    }

    StringBuilder result = new StringBuilder();
    result.append("<div class=\"connectedDomains\"><ul>");

    File resultsFolder = new File(resultsPath);
    if (resultsFolder.exists() && resultsFolder.isDirectory()) {
        // listFiles() returns null on I/O error, so guard before iterating.
        File[] allFiles = resultsFolder.listFiles();
        if (allFiles != null) {
            SimpleDateFormat format = new SimpleDateFormat("dd.MM.yyyy-HH:mm");
            for (File file : allFiles) {
                String filename = file.getName();
                String domain = Crawler.ResultsFilenameToDomain(filename);
                Date creationDate = Crawler.ResultsFilenameToDate(filename);
                String linkFormat = domain + "-" + format.format(creationDate);
                // Quote the href value so filenames with spaces or special
                // characters produce valid HTML.
                result.append("<li><a href=\"");
                result.append(Crawler.RESULTS_PATH_WEB);
                result.append(filename);
                result.append("\">");
                result.append(linkFormat);
                result.append("</a></li>");
            }
        }
    }
    // Close the list unconditionally so the opening tags above are always balanced.
    result.append("</ul></div>");

    return result.toString();
}
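// A minimal sketch (not part of the original class): filenames and domains are
// interpolated directly into HTML above, so a small escaping helper such as
// this hypothetical one could harden the listing against stray markup in
// file names.
private static String escapeHtml(String s) {
    // Escape '&' first so the other replacements are not double-escaped.
    return s.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace("\"", "&quot;");
}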
private byte[] prepareDefaultPage(String message) {
    // The stylesheet <link> belongs inside <head>; the original emitted it
    // after </head>.
    String head = "<!doctype html><html lang=\"en\"><head><title>Crawler HTML site</title>"
            + "<link href=\"css/style.css\" rel=\"stylesheet\" /></head>"
            + "<body><div class=\"header\"><h1>Crawler</h1></div>";

    String form;
    if (message != null) {
        form = "<div class=\"crawlerAnswer\"><h2>" + message + "</h2>"
                + "<a href=\"/\"><h3>Back to homepage</h3></a></div>";
    } else if (serverCrawler.isBusy()) {
        form = "<div class=\"crawlerAnswer\"><h2>Crawler is already running</h2></div>";
    } else {
        form = "<div class=\"crawlerForm\">"
                + "<form id=\"generalform\" method=\"post\" action=\"execResult.html\" class=\"crawlerFormTable\">"
                + "<table>"
                + "<tr><td><h3>Enter Domain</h3></td><td><input type=\"text\" name=\"Domain\"></td></tr>"
                + "<tr><td><h3><input type=\"checkbox\" name=\"portscan\">Perform full TCP port scan</h3></td></tr>"
                + "<tr><td><h3><input type=\"checkbox\" name=\"robots.txt\">Disrespect robots.txt</h3></td></tr>"
                + "<tr><td></td><td><input type=\"submit\" value=\"Start Crawler\"></td></tr>"
                + "</table></form></div>";
    }

    String resultPages = prepareResultPagesSection();
    String finish = "</body></html>";
    String result = head + form + resultPages + finish;
    return result.getBytes();
}
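// For reference, this builder serves three cases visible elsewhere in the class:
//   prepareDefaultPage("Crawler is busy");   // answer page with a message
//   prepareDefaultPage(null);                // homepage: form, or "already running"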
private byte[] activateCrawler(HtmlRequest htmlRequest) throws IOException {
    byte[] bodyInBytes;
    String domain = htmlRequest.parametersInRequestBody.get("Domain");

    // Checkboxes only appear in the POST body when ticked, so presence of the
    // key is the whole test.
    boolean performPortScan = htmlRequest.parametersInRequestBody.containsKey("portscan");
    boolean ignoreRobots = htmlRequest.parametersInRequestBody.containsKey("robots.txt");

    boolean isConfigureSucceeded = serverCrawler.ConfigureCrawler(domain, ignoreRobots, performPortScan);
    if (isConfigureSucceeded) {
        bodyInBytes = prepareDefaultPage("Crawler started successfully");
        System.out.println("Domain is: " + domain);
        System.out.println("Perform port scan: " + performPortScan);
        System.out.println("Ignore robots.txt: " + ignoreRobots);
    } else {
        bodyInBytes = prepareDefaultPage("Crawler is already running");
    }

    return bodyInBytes;
}
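// For reference: the form in prepareDefaultPage() yields a POST body along the
// lines of "Domain=example.com&portscan=on&robots.txt=on". Checked boxes
// submit "on"; unchecked boxes are omitted entirely, which is why the code
// above tests containsKey() rather than inspecting a value.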
private HtmlResponse respond200(HtmlRequest htmlRequest) throws IOException, InterruptedException {
    HtmlResponse response200 = new HtmlResponse();
    byte[] bodyInBytes;

    if (htmlRequest.type.equals("TRACE")) {
        // TRACE echoes the raw request back to the client.
        bodyInBytes = htmlRequest.unparsedRequest.getBytes();
    } else if (htmlRequest.type.equals("POST")) {
        if (htmlRequest.requestedFile.equals("/params_info.html")) {
            bodyInBytes = makeTable(htmlRequest.parametersInRequestBody);
        } else if (htmlRequest.requestedFile.equals(execResults)) {
            System.out.println("Parameters for Crawler : " + htmlRequest.parametersInRequestBody.toString());
            if (serverCrawler.isBusy()) {
                bodyInBytes = prepareDefaultPage("Crawler is busy");
            } else {
                String crawlerInputCheckResults = checkCrawlerInput(htmlRequest);
                if (crawlerInputCheckResults == null) {
                    // Input is valid: configure the crawler, then run it on
                    // its own thread so the response can return immediately.
                    bodyInBytes = activateCrawler(htmlRequest);
                    Thread crawlerThread = new Thread(serverCrawler);
                    crawlerThread.start();
                } else {
                    bodyInBytes = prepareDefaultPage(crawlerInputCheckResults);
                }
            }
        } else {
            bodyInBytes = null;
        }
    } else {
        bodyInBytes = readFileForResponse(htmlRequest);
    }

    response200.setEntityBody(bodyInBytes);
    response200.setStatus(htmlRequest.httpVersion, 200);

    // Both branches of the original POST/non-POST conditional computed the
    // same value, so derive the content type once.
    String contentType = getContentTypeFromFile(htmlRequest.requestedFile);
    response200.setContentTypeLine(contentType);

    return response200;
}
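// A minimal sketch, assuming getContentTypeFromFile() maps file extensions to
// MIME types roughly like this hypothetical helper (the real mapping lives
// elsewhere in this class and may differ):
private static String guessContentType(String requestedFile) {
    String name = requestedFile.toLowerCase();
    if (name.endsWith(".html") || name.endsWith(".htm")) return "text/html";
    if (name.endsWith(".css")) return "text/css";
    if (name.endsWith(".js")) return "application/javascript";
    if (name.endsWith(".png")) return "image/png";
    if (name.endsWith(".jpg") || name.endsWith(".jpeg")) return "image/jpeg";
    // Fall back to a generic binary type for unknown extensions.
    return "application/octet-stream";
}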
private String checkCrawlerInput(HtmlRequest htmlRequest) {
    String result = null;
    String domain = htmlRequest.parametersInRequestBody.get("Domain");
    String domainFound = Crawler.ParseURL(domain);

    // Trim a trailing separator left over from URL parsing; guard against an
    // empty string so charAt() cannot throw.
    if (!domainFound.isEmpty() && domainFound.charAt(domainFound.length() - 1) == '\\') {
        domainFound = domainFound.substring(0, domainFound.length() - 1);
    }

    try {
        // Issue a lightweight GET purely as a reachability probe.
        ClientRequest clientRequest = new ClientRequest(domainFound, ClientRequest.getRequest);
        if (clientRequest.responseHeaderFields == null) {
            return "Error connecting to: " + domain + "\n";
        }
    } catch (Exception e) {
        System.out.println("checkCrawlerInput: clientRequest generated error.");
        result = "Error connecting to: " + domain + "\n" + e.toString();
        e.printStackTrace();
    }

    return result;
}
protected void crawlContent(SimulatedArchivalUnit sau) {
    log.debug("crawlContent()");
    CrawlSpec spec = new SpiderCrawlSpec(sau.getNewContentCrawlUrls(), null);
    Crawler crawler = new NoCrawlEndActionsNewContentCrawler(sau, spec, new MockAuState());
    crawler.doCrawl();
}
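// Usage sketch (hypothetical test wiring; the actual setup for building a
// SimulatedArchivalUnit lives in the surrounding test class):
//   SimulatedArchivalUnit sau = ...;   // simulated content, already generated
//   crawlContent(sau);                 // crawl it without end-of-crawl actions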