public void getPostrResults(Keyword k) { try { int page = 1; boolean sessionEnd = false; while (!sessionEnd) { String keywordValue = k.getValue().replace(" ", "+"); String url = "http://postr.hu/?keres=" + keywordValue + "&oldal=" + page++; Document doc = getDocumentByUrl(url, 3); Elements links = doc.select(".activityflow .comment_text h3 a"); // Elements bodies = doc.select("#talalatbox p"); if (links.size() == 0) { sessionEnd = true; continue; } SearchSession searchSession = new SearchSession(); searchSession.setStartDate(Calendar.getInstance().getTime()); searchSession.setRawData(doc.html()); searchSession.setSearchText(url); searchSessionRepository.create(searchSession); for (int i = 0; i < links.size(); i++) { try { // ha a link tarlamaz #=t, akkor nem kell, mivel comment String linkHref = links.get(i).attr("href"); if (linkHref.contains("#")) { continue; } String linkText = links.get(i).text(); log.debug(linkHref); MessageDigest md = MessageDigest.getInstance("MD5"); md.update(linkHref.getBytes()); byte byteData[] = md.digest(); // convert the byte to hex format method 1 StringBuffer sb = new StringBuffer(); for (int j = 0; j < byteData.length; j++) { sb.append(Integer.toString((byteData[j] & 0xff) + 0x100, 16).substring(1)); } String sourceId = sb.toString(); log.debug("sourceId: " + sourceId); // ellenorizni kell az adatbazisban if (!dataService.isDataWithSourceIdAndKeyword(sourceId, k)) { Document bodyDoc = getDocumentByUrl(linkHref, 3); // log.debug(bodyDoc.html()); Elements bodyElements = bodyDoc.getElementsByClass("text"); if (bodyElements.size() == 0) { continue; } String body = bodyElements.get(0).text(); log.debug(body); boolean resp = dictionaryService.valideText(body, 41); if (resp) { DateFormat formatter; Date originalDate = null; formatter = new SimpleDateFormat("yyyymmdd"); // String[] cut = linkHref.split("/"); // originalDate = formatter // .parse(cut[3] + cut[4] + cut[5]); dataService.createData( sourceId, body, linkHref, linkText, "postr", searchSession, originalDate, k); } } } catch (Exception e) { log.error("Hiba a data linkek feldolgozasa soran:"); e.printStackTrace(); } } searchSession.setEndDate(Calendar.getInstance().getTime()); searchSessionRepository.update(searchSession); } // sessionEnd } catch (Exception e) { e.printStackTrace(); } }
public void getBloghuResults(Keyword k) { try { int page = 1; boolean sessionEnd = false; while (!sessionEnd) { String keywordValue = k.getValue().replace(" ", "+"); String url = "http://blog.hu/cimlap/search/?sterm=" + keywordValue + "&page=" + page++; Document doc = getDocumentByUrl(url, 3); Elements links = doc.select("#talalatbox h1 a"); Elements bodies = doc.select("#talalatbox p"); if (links.size() == 0) { sessionEnd = true; continue; } SearchSession searchSession = new SearchSession(); searchSession.setStartDate(Calendar.getInstance().getTime()); searchSession.setRawData(doc.html()); searchSession.setSearchText(url); searchSessionRepository.create(searchSession); for (int i = 0; i < links.size(); i++) { String linkHref = links.get(i).attr("href"); String linkText = links.get(i).text(); log.debug(linkHref); String sourceId = createMd5(linkHref); log.debug("sourceId: " + sourceId); // ellenorizni kell az adatbazisban if (!dataService.isDataWithSourceIdAndKeyword(sourceId, k)) { Document bodyDoc = getDocumentByUrl(linkHref, 3); // log.debug(bodyDoc.html()); Element bodyElements = bodyDoc.getElementsByClass("post-content").get(0); String body = bodyElements.text(); log.debug(body); boolean resp = dictionaryService.valideText(body, 41); if (resp) { DateFormat formatter; Date originalDate; formatter = new SimpleDateFormat("yyyymmdd"); String[] cut = linkHref.split("/"); originalDate = formatter.parse(cut[3] + cut[4] + cut[5]); dataService.createData( sourceId, body, linkHref, linkText, "bloghu", searchSession, originalDate, k); } } } searchSession.setEndDate(Calendar.getInstance().getTime()); searchSessionRepository.update(searchSession); } // sessionEnd } catch (Exception e) { e.printStackTrace(); } }