@Override
public void run() {
    final String[] crawlerArgs = {
        searchwords, htmlpath, txtpath, xmlpath, ippath + "/plainIPs.txt", pagecount
    };

    // Crawl pages for the search words before classification.
    int ifCrawlFlag = 0;
    if (ifCrawlFlag == 0) {
        try {
            Crawler c = new Crawler();
            c.excute(crawlerArgs, dir);
        } catch (IOException | URISyntaxException | InterruptedException e) {
            e.printStackTrace();
        }
    }

    // System.out.println("now begin to classify the file:");
    emoClassifyUnity emo = new emoClassifyUnity();
    List<String[]> list2 = emo.emoClassify("./mycraw/" + dir + "/" + searchwords + ".xml");

    // Save the topic: delete any existing row for this topic, then insert the new one.
    JdbcUtils db = new JdbcUtils();
    db.getConnection();
    String sql = "insert into topic(topic, topicdescription, pdate) values (?,?,?)";
    String deltopicid = "delete from topic where topic=?";

    List<Object> paramsdel = new ArrayList<Object>();
    paramsdel.add(topiclist[0]);
    try {
        db.updateBypreparedPreparedStatement(deltopicid, paramsdel);
    } catch (SQLException e) {
        e.printStackTrace();
    }

    // System.out.println(topiclist[0] + "---->>>" + topiclist[1]);
    List<Object> params = new ArrayList<Object>();
    params.add(topiclist[0]);
    params.add(topiclist[1]);
    java.sql.Date currentDate = new java.sql.Date(System.currentTimeMillis());
    params.add(currentDate);
    try {
        db.updateBypreparedPreparedStatement(sql, params);
    } catch (SQLException e) {
        e.printStackTrace();
    }
    db.releasecon();

    // savetomysql(gettopicid(searchwords), list2);
    String info = topiclist[0] + ": finished downloading and word segmentation; results saved to the database";
    logwrite.write("./timerlog.txt", info);
}
private String prepareResultPagesSection() {
    String resultsPath;
    try {
        resultsPath = rootDirectory.getCanonicalPath() + Crawler.RESULTS_PATH_LOCAL;
    } catch (IOException e) {
        System.out.println("HTTPRequest: error resolving root directory " + rootDirectory.toString());
        return "";
    }

    StringBuilder result = new StringBuilder();
    result.append("<div class=\"connectedDomains\"><ul>");

    File resultsFolder = new File(resultsPath);
    if (resultsFolder.exists() && resultsFolder.isDirectory()) {
        File[] allFiles = resultsFolder.listFiles();
        SimpleDateFormat format = new SimpleDateFormat("dd.MM.yyyy-HH:mm");
        for (File file : allFiles) {
            String filename = file.getName();
            String domain = Crawler.ResultsFilenameToDomain(filename);
            Date creationDate = Crawler.ResultsFilenameToDate(filename);
            String linkFormat = domain + "-" + format.format(creationDate);
            // Quote the href value so the generated anchor is valid HTML.
            result.append("<li><a href=\"");
            result.append(Crawler.RESULTS_PATH_WEB);
            result.append(filename);
            result.append("\">");
            result.append(linkFormat);
            result.append("</a></li>");
        }
    }
    // Close the list even when the results folder is missing, so the markup stays balanced.
    result.append("</ul></div>");

    return result.toString();
}
/**
 * Attempts to search the mod website for the mod and pull the recent versions of the mod.
 *
 * @param mod The Mod to search for on the website.
 * @param modInfoList The JList to populate/alter.
 */
public void getRecentVersionsOfModAsync(Profile.Mod mod, JList modInfoList) {
    // Here we set a thread task to get the version numbers for the mod. This will look at the
    // site and search for the mod, then pull all versions from it.
    Runnable task = () -> Crawler.readVersionInfoOfMod(mod.nameWithoutVersion);
    Thread thread = new Thread(task);
    thread.start();

    // Our timer that checks every 200ms if the thread has finished.
    Timer timer = new Timer(200, null);
    timer.addActionListener(
        ev -> {
            if (thread.getState() != Thread.State.TERMINATED) {
                timer.restart();
            } else {
                timer.stop();
                DefaultListModel listModel = (DefaultListModel) modInfoList.getModel();

                // Get the modVersionInfo from the crawler. If not null, add to the list.
                String[][] modVersionInfo = Crawler.getModVersionInfo();
                if (modVersionInfo != null) {
                    listModel.addElement("Recent Versions:");
                    for (String[] info : modVersionInfo) {
                        listModel.addElement(" v" + info[0] + " for " + info[1]);
                    }
                } else {
                    listModel.addElement("Couldn't find the mod on the website.");
                }
                modInfoList.setModel(listModel);
            }
        });
    timer.start();
}
public void importArchive(int day) {
    try {
        Calendar processStartedAt = Calendar.getInstance();
        Calendar startDate = Calendar.getInstance();
        startDate.add(Calendar.DAY_OF_YEAR, -day);
        Calendar endDate = Calendar.getInstance();

        // List<Crawler> crawlers = new ArrayList<>();
        ScraperConfiguration config =
            Crawler.getScraperConfig(context, null, Crawler.CrawlType.DATA_ARCHIVE);
        Map params = new HashMap();
        dateFormat = new SimpleDateFormat(DSE_DATA_ARCHIVE_DATE_FORMAT);
        params.put("startDate", dateFormat.format(startDate.getTime()));
        params.put("endDate", dateFormat.format(endDate.getTime()));

        Item item = new Item(ALL_INSTRUMENT);
        Crawler crawler = new Crawler(config, item, Crawler.CrawlType.DATA_ARCHIVE, params);
        crawler.start();
        // crawlers.add(crawler);

        int counter = 0;
        crawler.join();
        List<Item> items = (List<Item>) crawler.getParams().get("items");
        List<Item> dsexItems = importDSEXArchive(day);
        System.out.println("dsex items: " + dsexItems.size());
        items.addAll(dsexItems);

        // filterOutUnneccessaryCodes(items, watchMatrix);
        if (items.size() > 0) {
            dao.open();
            dao.importItems(items);
            dao.close();
            // System.out.println("[" + (++counter) + "]Import data archive finished for "
            //     + items.get(0).getCode());
        }

        Calendar processEndedAt = Calendar.getInstance();
        long elapsedTime =
            (processEndedAt.getTimeInMillis() - processStartedAt.getTimeInMillis()) / 1000;
        System.out.println(
            "Time elapsed to sync " + day + " day archive for " + items.size() + " item: "
                + (elapsedTime / 60) + " minutes " + (elapsedTime % 60) + " seconds");
    } catch (IOException | InterruptedException | SQLException | ClassNotFoundException ex) {
        Logger.getLogger(ImportService.class.getName()).log(Level.SEVERE, null, ex);
        ex.printStackTrace();
    }
}
private byte[] prepareDefaultPage(String message) {
    // The stylesheet link belongs inside <head>, before it is closed.
    String head =
        "<!doctype html><html lang=\"en\"><head><title>Crawler HTML site</title>"
            + "<link href=\"css/style.css\" rel=\"stylesheet\" /></head>"
            + "<body><div class=\"header\"><h1>Crawler</h1></div>";
    String form;
    if (message != null) {
        form =
            "<div class=\"crawlerAnswer\"><h2>" + message + "</h2>"
                + "<a href=\"/\"><h3>Back to homepage</h3></a></div>";
    } else if (serverCrawler.isBusy()) {
        form = "<div class=\"crawlerAnswer\"><h2>Crawler is already running</h2></div>";
    } else {
        form =
            "<div class=\"crawlerForm\"><form id=\"generalform\" method=\"post\" action=\"execResult.html\" class=\"crawlerFormTable\">"
                + "<table><tr><td><h3>Enter Domain</h3></td><td><input type=\"text\" name=\"Domain\"></td></tr><tr>"
                + "<td><h3><input type=\"checkbox\" name=\"portscan\">Perform full TCP port scan</h3></td></tr><tr>"
                + "<td><h3><input type=\"checkbox\" name=\"robots.txt\">Disrespect robots.txt</h3></td></tr>"
                + "<tr><td></td><td><input type=\"submit\" value=\"Start Crawler\"></td></tr></table></form></div>";
    }
    String resultPages = prepareResultPagesSection();
    String finish = "</body></html>";
    String result = head + form + resultPages + finish;
    return result.getBytes();
}
private byte[] activateCrawler(HtmlRequest htmlRequest) throws IOException {
    byte[] bodyInBytes;
    String domain = htmlRequest.parametersInRequestBody.get("Domain");

    boolean ignoreRobots = false;
    boolean performPortScan = false;
    if (htmlRequest.parametersInRequestBody.containsKey("portscan")) {
        performPortScan = true;
    }
    if (htmlRequest.parametersInRequestBody.containsKey("robots.txt")) {
        ignoreRobots = true;
    }

    boolean isConfigureSucceeded = serverCrawler.ConfigureCrawler(domain, ignoreRobots, performPortScan);
    if (isConfigureSucceeded) {
        bodyInBytes = prepareDefaultPage("Crawler started successfully");
        System.out.println("Domain is: " + domain);
        System.out.println("Perform port scan: " + performPortScan);
        System.out.println("Ignore robots.txt: " + ignoreRobots);
    } else {
        bodyInBytes = prepareDefaultPage("Crawler is already running");
    }
    return bodyInBytes;
}
public void importArchive(String code, int day) {
    if ("DSEX".equals(code)) {
        importDSEXArchiveOnly(day);
        return;
    }
    try {
        if (day < 1) {
            day = 7;
        }
        Calendar startDate = Calendar.getInstance();
        startDate.add(Calendar.DAY_OF_YEAR, -day);
        Calendar endDate = Calendar.getInstance();

        Item item = new Item();
        item.setCode(code);

        List<Crawler> crawlers = new ArrayList<>();
        ScraperConfiguration config =
            Crawler.getScraperConfig(context, null, Crawler.CrawlType.DATA_ARCHIVE);
        Map params = new HashMap();
        dateFormat = new SimpleDateFormat(DSE_DATA_ARCHIVE_DATE_FORMAT);
        params.put("startDate", dateFormat.format(startDate.getTime()));
        params.put("endDate", dateFormat.format(endDate.getTime()));

        Crawler crawler = new Crawler(config, item, Crawler.CrawlType.DATA_ARCHIVE, params);
        crawler.start();
        crawlers.add(crawler);
        for (Crawler craw : crawlers) {
            craw.join();
        }

        List<Item> items = (List<Item>) params.get("items");
        dao.open();
        dao.importItems(items);
        dao.close();
    } catch (FileNotFoundException | InterruptedException | SQLException | ClassNotFoundException ex) {
        Logger.getLogger(ImportService.class.getName()).log(Level.SEVERE, null, ex);
        ex.printStackTrace();
    }
}
public List<Item> importDSEXArchive(int day) {
    List<Item> items = new ArrayList<>();
    try {
        Calendar processStartedAt = Calendar.getInstance();
        Calendar startDate = Calendar.getInstance();
        startDate.add(Calendar.DAY_OF_YEAR, -day);
        Calendar endDate = Calendar.getInstance();

        String path = Utils.getConfigFilesPath();
        ScraperConfiguration config =
            Crawler.getScraperConfig(null, path, Crawler.CrawlType.DSEX_DATA_ARCHIVE);
        Map params = new HashMap();
        dateFormat = new SimpleDateFormat(DSE_DATA_ARCHIVE_DATE_FORMAT);
        params.put("startDate", dateFormat.format(startDate.getTime()));
        params.put("endDate", dateFormat.format(endDate.getTime()));

        Item item = new Item();
        Crawler crawler = new Crawler(config, item, Crawler.CrawlType.DSEX_DATA_ARCHIVE, params);
        crawler.start();
        crawler.join();
        items = (List<Item>) crawler.getParams().get("items");

        Calendar processEndedAt = Calendar.getInstance();
        long elapsedTime =
            (processEndedAt.getTimeInMillis() - processStartedAt.getTimeInMillis()) / 1000;
        System.out.println(
            "Time elapsed to sync " + day + " day archive for " + items.size() + " item: "
                + (elapsedTime / 60) + " minutes " + (elapsedTime % 60) + " seconds");
    } catch (IOException | InterruptedException ex) {
        Logger.getLogger(ImportService.class.getName()).log(Level.SEVERE, null, ex);
        ex.printStackTrace();
    }
    return items;
}
private Hashtable<String, Integer> extractData(String docUrl, Page page) throws Exception {
    Hashtable<String, Integer> businessTable = new Hashtable<String, Integer>();
    String currentBusiness = "";
    // Compile the business-URL pattern once instead of on every anchor tag.
    Pattern pBusiness = Pattern.compile("^(http://www.yelp.com/biz/)(\\S)+");
    Lexer lexer = new Lexer(page);

    while (true) {
        Node node = lexer.nextNode();
        if (node == null) {
            break;
        }
        if (node instanceof TagNode) {
            TagNode tagNode = (TagNode) node;
            if (tagNode.getTagName().equals("A")) {
                String href = tagNode.getAttribute("href");
                if (href != null) {
                    String absUrl = AbsUrlConstructor.construct(docUrl, href);
                    Crawler.dispatchUrl(absUrl);
                    if (pBusiness.matcher(absUrl).matches()) {
                        currentBusiness = extractBusinessName(href);
                        if (!businessTable.containsKey(currentBusiness)) {
                            businessTable.put(currentBusiness, -1);
                        }
                        // System.out.println("currentBusiness = " + currentBusiness);
                        // rating = "4";
                        // UpdateDatabase(linkID, business, rating, userID);
                        // System.out.println(business + " added.");
                    }
                }
            } else if (tagNode.getTagName().equals("IMG")) {
                String imgClass = tagNode.getAttribute("class");
                if (imgClass != null) {
                    String rating = "";
                    String[] rate = imgClass.split("_");
                    int num = rate.length - 1;
                    if (!rate[num].equals("loader")) {
                        rating = rate[num].trim();
                        // Guard against a rating that appears before any business link:
                        // get() would return null and unboxing it would throw.
                        Integer existing = businessTable.get(currentBusiness);
                        if (existing != null && existing == -1) {
                            businessTable.put(currentBusiness, Integer.parseInt(rating));
                        }
                    }
                    // System.out.println(linkID + " " + business + " " + rating + " " + userID);
                }
            }
        }
    }
    return businessTable;
}
/** Pulls the recently updated mods from the mod website. */
public void downloadRecentModList() {
    Runnable task = () -> Crawler.processFilter("http://www.factoriomods.com/recently-updated", 1);
    Thread thread = new Thread(task);
    thread.start();

    final javax.swing.Timer timer = new javax.swing.Timer(100, null);
    timer.addActionListener(
        ev -> {
            if (thread.getState() != Thread.State.TERMINATED) {
                timer.restart();
            } else {
                System.out.println("Done with page");
                timer.stop();
                ArrayList<String[]> modInfoListFromBrowser = Crawler.getSpecificFilterModsInfo();
                DefaultTableModel model = new DefaultTableModel(new String[] {"", "", "", ""}, 0);
                for (String[] info : modInfoListFromBrowser) {
                    model.addRow(info);
                }
                this.window.setModBrowserTableModel(model);
            }
        });
    timer.start();
}
public void indexBook(Book book) throws BookException, IOException {
    IndexManager inMan = new IndexManager();
    inMan.deleteBook(book.getName());

    CrawlLinkListener list = new CrawlLinkListener(book, inMan);

    DownloadParameters dlPars = new DownloadParameters();
    dlPars = dlPars.changeMaxThreads(0);
    dlPars = dlPars.changeMaxPageSize(-1);

    Crawler c = new Crawler();
    c.setRoot(new Link(book.getLocation()));
    c.addClassifier(new StandardClassifier());
    c.setDownloadParameters(dlPars);
    c.addLinkListener(list);
    c.setDomain(Crawler.SERVER);
    c.setLinkType(Crawler.HYPERLINKS);
    c.setMaxDepth(15);
    c.run();
}
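// Usage sketch (illustrative only): the Book construction and the "BookIndexer"
// class name below are assumptions, not taken from the codebase.
//
//   Book book = ...;                    // a Book carrying a name and a root location URL
//   new BookIndexer().indexBook(book);  // "BookIndexer" stands in for the enclosing class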
private HtmlResponse respond200(HtmlRequest htmlRequest) throws IOException, InterruptedException {
    HtmlResponse response200 = new HtmlResponse();
    byte[] bodyInBytes;

    if (htmlRequest.type.equals("TRACE")) {
        bodyInBytes = htmlRequest.unparsedRequest.getBytes();
    } else if (htmlRequest.type.equals("POST")) {
        if (htmlRequest.requestedFile.equals("/params_info.html")) {
            bodyInBytes = makeTable(htmlRequest.parametersInRequestBody);
        } else if (htmlRequest.requestedFile.equals(execResults)) {
            System.out.println(
                "Parameters for Crawler : " + htmlRequest.parametersInRequestBody.toString());
            if (serverCrawler.isBusy()) {
                bodyInBytes = prepareDefaultPage("Crawler is busy");
            } else {
                String crawlerInputCheckResults = checkCrawlerInput(htmlRequest);
                if (crawlerInputCheckResults == null) {
                    bodyInBytes = activateCrawler(htmlRequest);
                    Thread crawlerThread = new Thread(serverCrawler);
                    crawlerThread.start();
                } else {
                    bodyInBytes = prepareDefaultPage(crawlerInputCheckResults);
                }
            }
        } else {
            bodyInBytes = null;
        }
    } else {
        bodyInBytes = readFileForResponse(htmlRequest);
    }

    response200.setEntityBody(bodyInBytes);
    response200.setStatus(htmlRequest.httpVersion, 200);

    // Both branches of the original conditional resolved the content type the same way,
    // so a single lookup suffices for POST and non-POST requests alike.
    String contentType = getContentTypeFromFile(htmlRequest.requestedFile);
    response200.setContentTypeLine(contentType);

    return response200;
}
/**
 * Downloads a version of a mod.
 *
 * @param modVersion The version text for the mod.
 * @param selectedIndex The selected index from the UI.
 */
public void downloadVersionOfMod(String modVersion, int selectedIndex) {
    // Trim whitespace and check if we have a 'v0.1' or some match.
    if (modVersion.trim().matches("v\\d+.*")) {
        // 4 is our magic number. We have 4 lines before this one so we subtract 4 to get
        // the adjusted index.
        selectedIndex = selectedIndex - 4;

        String[][] info = Crawler.getModVersionInfo();
        String path =
            this.model.getFactorioModManagerPath()
                + this.model.getCurrentlySelectedProfile()
                + "/"
                + this.model.getCurrentlySelectedModName()
                + "_"
                + info[selectedIndex][Crawler.ModVersionInfo.VERSION.getValue()]
                + ".zip";

        // TODO Who knows, this doesn't work. Downloads empty file.
        // Create a new file and try to download into it.
        File file = new File(path);
        try {
            // URL url = new URL(info[selectedIndex][Crawler.ModVersionInfo.DOWNLOAD.getValue()]);
            // HttpURLConnection con = (HttpURLConnection) url.openConnection();
            // InputStream stream = con.getInputStream();
            // Files.copy(stream, Paths.get(path));
            org.apache.commons.io.FileUtils.copyURLToFile(
                new URL(info[selectedIndex][Crawler.ModVersionInfo.DOWNLOAD.getValue()]),
                file,
                2000,
                2000);
        } catch (IOException e) {
            e.printStackTrace();
        }

        System.out.println(
            "Index: " + info[selectedIndex][Crawler.ModVersionInfo.DOWNLOAD.getValue()]);
    }
}
private String checkCrawlerInput(HtmlRequest htmlRequest) {
    String result = null;
    String domain = htmlRequest.parametersInRequestBody.get("Domain");
    String domainFound = Crawler.ParseURL(domain);

    // Strip a trailing backslash left over from URL parsing (guarding against an empty result).
    if (!domainFound.isEmpty() && domainFound.charAt(domainFound.length() - 1) == '\\') {
        domainFound = domainFound.substring(0, domainFound.length() - 1);
    }

    try {
        ClientRequest clientRequest = new ClientRequest(domainFound, ClientRequest.getRequest);
        if (clientRequest.responseHeaderFields == null) {
            return "Error connecting to: " + domain + "\n";
        }
    } catch (Exception e) {
        System.out.println("checkCrawlerInput: clientRequest generated error.");
        result = "Error connecting to: " + domain + "\n" + e.toString();
        e.printStackTrace();
    }
    return result;
}
protected void crawlContent(SimulatedArchivalUnit sau) {
    log.debug("crawlContent()");
    CrawlSpec spec = new SpiderCrawlSpec(sau.getNewContentCrawlUrls(), null);
    Crawler crawler = new NoCrawlEndActionsNewContentCrawler(sau, spec, new MockAuState());
    crawler.doCrawl();
}
public void test01() {
    Crawler crawler = new Crawler();
    Card card = crawler.crawlCard("Civilized Scholar");
    assertNotNull(card);
}
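// A hedged companion check (sketch only): it assumes Card exposes a getName()
// accessor; enable the commented assertion only if the real Card API provides it.
public void test02() {
    Crawler crawler = new Crawler();
    Card card = crawler.crawlCard("Civilized Scholar");
    assertNotNull(card);
    // assertEquals("Civilized Scholar", card.getName());
}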
public void importAlphabeticArchive(char alphabet, int day) {
    SyncService syncService = new SyncService(context);
    try {
        Calendar processStartedAt = Calendar.getInstance();
        List<Item> watchMatrix = syncService.getCodes();
        Calendar startDate = Calendar.getInstance();
        startDate.add(Calendar.DAY_OF_YEAR, -day);
        Calendar endDate = Calendar.getInstance();

        List<Crawler> crawlers = new ArrayList<>();
        for (Item item : watchMatrix) {
            if (!item.getCode().startsWith(alphabet + "")) {
                continue;
            }
            ScraperConfiguration config =
                Crawler.getScraperConfig(context, null, Crawler.CrawlType.DATA_ARCHIVE);
            Map params = new HashMap();
            dateFormat = new SimpleDateFormat(DSE_DATA_ARCHIVE_DATE_FORMAT);
            params.put("startDate", dateFormat.format(startDate.getTime()));
            params.put("endDate", dateFormat.format(endDate.getTime()));
            Crawler crawler = new Crawler(config, item, Crawler.CrawlType.DATA_ARCHIVE, params);
            crawler.start();
            crawlers.add(crawler);
        }

        int counter = 0;
        for (Crawler craw : crawlers) {
            craw.join();
            List<Item> items = (List<Item>) craw.getParams().get("items");
            if (items.size() == 0) {
                System.out.println("Could not update for item: " + craw.getItem().getCode());
                continue;
            }
            dao.open();
            dao.importItems(items);
            dao.close();
            System.out.println(
                "[" + (++counter) + "]Import data archive finished for " + items.get(0).getCode());
        }

        Calendar processEndedAt = Calendar.getInstance();
        long elapsedTime =
            (processEndedAt.getTimeInMillis() - processStartedAt.getTimeInMillis()) / 1000;
        System.out.println(
            "Time elapsed to sync " + day + " day archive for " + crawlers.size() + " item: "
                + (elapsedTime / 60) + " minutes " + (elapsedTime % 60) + " seconds");
    } catch (IOException | InterruptedException | SQLException | ClassNotFoundException ex) {
        Logger.getLogger(ImportService.class.getName()).log(Level.SEVERE, null, ex);
        ex.printStackTrace();
    }
}