コード例 #1
0
  /**
   * Crawls the postr.hu search result pages for the given keyword and stores every new post that
   * passes the dictionary check through {@code dataService}.
   *
   * <p>Pages are requested one after another, starting at page 1, until a page yields no result
   * links. For every page that has links a {@code SearchSession} is created (start date, raw HTML
   * of the page, request URL) and, once all links of that page were handled, updated with an end
   * date. A failure while handling a single link is logged and the remaining links are still
   * processed; any other failure is logged and ends the crawl. No exception is propagated to the
   * caller.
   *
   * @param k the keyword to search for; spaces in its value are replaced with {@code +} when the
   *     query string is built
   */
  public void getPostrResults(Keyword k) {
    try {
      // The keyword part of the query does not change between pages, so build it once.
      String keywordValue = k.getValue().replace(" ", "+");
      int page = 1;

      while (true) {
        String url = "http://postr.hu/?keres=" + keywordValue + "&oldal=" + page++;

        // NOTE(review): the second argument is presumably a retry count -- confirm.
        Document doc = getDocumentByUrl(url, 3);
        Elements links = doc.select(".activityflow .comment_text h3 a");

        // A page without result links marks the end of the result list.
        if (links.size() == 0) {
          break;
        }

        SearchSession searchSession = new SearchSession();
        searchSession.setStartDate(Calendar.getInstance().getTime());
        searchSession.setRawData(doc.html());
        searchSession.setSearchText(url);
        searchSessionRepository.create(searchSession);

        for (int i = 0; i < links.size(); i++) {
          try {
            String linkHref = links.get(i).attr("href");

            // Links containing '#' point to comments, not posts, so they are skipped.
            if (linkHref.contains("#")) {
              continue;
            }

            String linkText = links.get(i).text();
            log.debug(linkHref);

            // The source id is the lower-case hex MD5 digest of the link.
            // NOTE(review): getBytes() uses the platform default charset, so the id of a
            // non-ASCII link can differ between machines. getBloghuResults uses
            // createMd5(linkHref) for the same purpose -- consider reusing it here once it is
            // confirmed to produce the same digest.
            MessageDigest md = MessageDigest.getInstance("MD5");
            md.update(linkHref.getBytes());
            byte[] byteData = md.digest();

            StringBuilder sb = new StringBuilder();
            for (byte b : byteData) {
              // Adding 0x100 and dropping the leading digit keeps the leading zero of each byte.
              sb.append(Integer.toString((b & 0xff) + 0x100, 16).substring(1));
            }

            String sourceId = sb.toString();
            log.debug("sourceId: " + sourceId);

            // Only fetch and store the post if it is not in the database yet for this keyword.
            if (!dataService.isDataWithSourceIdAndKeyword(sourceId, k)) {

              Document bodyDoc = getDocumentByUrl(linkHref, 3);
              Elements bodyElements = bodyDoc.getElementsByClass("text");

              // Nothing to store when the page has no element with class "text".
              if (bodyElements.size() == 0) {
                continue;
              }

              String body = bodyElements.get(0).text();
              log.debug(body);

              // NOTE(review): the meaning of the constant 41 is not visible here -- confirm.
              boolean resp = dictionaryService.valideText(body, 41);

              if (resp) {
                // The original publication date is not extracted for postr links, so null is
                // passed on.
                Date originalDate = null;

                dataService.createData(
                    sourceId, body, linkHref, linkText, "postr", searchSession, originalDate, k);
              }
            }
          } catch (Exception e) {
            // Best effort per link: log with the stack trace and go on with the next link.
            log.error("Hiba a data linkek feldolgozasa soran:", e);
          }
        }

        searchSession.setEndDate(Calendar.getInstance().getTime());
        searchSessionRepository.update(searchSession);
      }
    } catch (Exception e) {
      // Route the failure through the logger instead of printing it to stderr.
      log.error("getPostrResults failed", e);
    }
  }
コード例 #2
0
  /**
   * Crawls the blog.hu search result pages for the given keyword and stores every new post that
   * passes the dictionary check through {@code dataService}.
   *
   * <p>Pages are requested one after another, starting at page 1, until a page yields no result
   * links. For every page that has links a {@code SearchSession} is created (start date, raw HTML
   * of the page, request URL) and, once all links of that page were handled, updated with an end
   * date. A failure while handling a single link is logged and the remaining links are still
   * processed; any other failure is logged and ends the crawl. No exception is propagated to the
   * caller.
   *
   * @param k the keyword to search for; spaces in its value are replaced with {@code +} when the
   *     query string is built
   */
  public void getBloghuResults(Keyword k) {
    try {
      // The keyword part of the query does not change between pages, so build it once.
      String keywordValue = k.getValue().replace(" ", "+");
      int page = 1;

      while (true) {
        String url = "http://blog.hu/cimlap/search/?sterm=" + keywordValue + "&page=" + page++;

        // NOTE(review): the second argument is presumably a retry count -- confirm.
        Document doc = getDocumentByUrl(url, 3);
        Elements links = doc.select("#talalatbox h1 a");

        // A page without result links marks the end of the result list.
        if (links.size() == 0) {
          break;
        }

        SearchSession searchSession = new SearchSession();
        searchSession.setStartDate(Calendar.getInstance().getTime());
        searchSession.setRawData(doc.html());
        searchSession.setSearchText(url);
        searchSessionRepository.create(searchSession);

        for (int i = 0; i < links.size(); i++) {
          try {
            String linkHref = links.get(i).attr("href");
            String linkText = links.get(i).text();

            log.debug(linkHref);

            String sourceId = createMd5(linkHref);
            log.debug("sourceId: " + sourceId);

            // Only fetch and store the post if it is not in the database yet for this keyword.
            if (!dataService.isDataWithSourceIdAndKeyword(sourceId, k)) {

              Document bodyDoc = getDocumentByUrl(linkHref, 3);
              Elements bodyElements = bodyDoc.getElementsByClass("post-content");

              // Skip pages without a "post-content" element instead of failing on get(0).
              if (bodyElements.size() == 0) {
                continue;
              }

              String body = bodyElements.get(0).text();
              log.debug(body);

              // NOTE(review): the meaning of the constant 41 is not visible here -- confirm.
              boolean resp = dictionaryService.valideText(body, 41);

              if (resp) {
                // "MM" is month-of-year; lower-case "mm" would be minute-of-hour and would leave
                // the month at its default.
                DateFormat formatter = new SimpleDateFormat("yyyyMMdd");

                // Path segments 3..5 of the link are concatenated into the date string.
                // NOTE(review): assumes links look like http://host/yyyy/MM/dd/... -- confirm.
                String[] cut = linkHref.split("/");
                Date originalDate = formatter.parse(cut[3] + cut[4] + cut[5]);

                dataService.createData(
                    sourceId, body, linkHref, linkText, "bloghu", searchSession, originalDate, k);
              }
            }
          } catch (Exception e) {
            // Best effort per link: a malformed link or page must not abort the whole crawl or
            // leave the search session without an end date.
            log.error("Hiba a data linkek feldolgozasa soran:", e);
          }
        }

        searchSession.setEndDate(Calendar.getInstance().getTime());
        searchSessionRepository.update(searchSession);
      }
    } catch (Exception e) {
      // Route the failure through the logger instead of printing it to stderr.
      log.error("getBloghuResults failed", e);
    }
  }