public static void focused() throws FileNotFoundException, UnsupportedEncodingException { while ((!toVisit.isEmpty()) && (keylinks.size() < 1000)) { // startTime = new Date().getTime(); Link node = toVisit.get(0); while (true) { if (node.getDistance() <= 5) try { doc = Jsoup.connect(node.getUrl()).get(); if (checkkeyphrase(doc)) { keylinks.add(node); if (node.getDistance() < 5) { links = doc.select("a[href]"); for (Element link : links) { filter(new Link(link.absUrl("href"), (node.getDistance() + 1))); } } System.out.println( "toVisit: " + toVisit.size() + " Visited: " + visited.size() + " Distance: " + node.getDistance() + " " + " keylinks: " + keylinks.size() + " " + toVisit.get(0).getUrl()); } visited.add(node); break; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } toVisit.remove(0); // endTime = new Date().getTime(); // execTime = endTime - startTime; // if ( execTime < POLITE ) Thread.sleep( POLITE - execTime ); } PrintWriter writer = new PrintWriter("keylinks.txt", "UTF-8"); System.out.println("Visted: " + visited.size()); System.out.println("toVisit: " + toVisit.size()); System.out.println("keylinks: " + keylinks.size()); for (Link l : keylinks) { writer.println(l.getUrl()); } writer.close(); }
public static void filter(Link link) { // boolean c1, c2, c3, c4, c5; link.setUrl(link.getUrl().split("#")[0]); if (link.getUrl().toLowerCase().contains("https://en.wikipedia.org/wiki/".toLowerCase())) if ((link.getUrl().length() - link.getUrl().replaceAll(":", "").length()) == 1) if (!(link.getUrl() .toLowerCase() .contains("https://en.wikipedia.org/wiki/Main_Page".toLowerCase()))) if (link.getDistance() <= MAX_DEPTH) if ((!toVisit.contains(link)) && (!visited.contains(link))) toVisit.add(link); }
public void postProcessResult(Link result, HttpServletRequest request) { StringBuffer url = new StringBuffer(result.getUrl()); int idx = url.indexOf("/"); LOG.debug("postProcessResult() {} ({})", idx, url); if (idx >= 0) { url.insert(idx, getPrefix(request)); } // if LOG.debug("postProcessResult() {} ({})", idx, url); result.setUrl(url.toString()); } // postProcessResult()
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException, InterruptedException { keyphrase = "concordance"; toVisit.add(SEED); if (keyphrase == null) unfocused(); else { Link node = toVisit.get(0); while (true) { if (node.getDistance() < 5) try { doc = Jsoup.connect(node.getUrl()).get(); links = doc.select("a[href]"); for (Element link : links) { filter(new Link(link.absUrl("href"), (node.getDistance() + 1))); } visited.add(node); if (checkkeyphrase(doc)) { keylinks.add(node); } System.out.println( "toVisit: " + toVisit.size() + " Visited: " + visited.size() + " Distance: " + node.getDistance() + " " + " keylinks: " + keylinks.size() + " " + toVisit.get(0).getUrl()); break; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } toVisit.remove(0); focused(); } }
/**
 * Returns the URL reported by the {@code faq} field.
 *
 * @return the result of {@code faq.getUrl()}
 */
public String getFaqLink() {
    final String faqUrl = faq.getUrl();
    return faqUrl;
}
/**
 * Returns the URL reported by the {@code demo} field.
 *
 * @return the result of {@code demo.getUrl()}
 */
public String getDemoLink() {
    final String demoUrl = demo.getUrl();
    return demoUrl;
}