コード例 #1
0
ファイル: Parser.java プロジェクト: chenyang/Project_Euronext
  /**
   * Crawls the page at {@code url} if the database has not seen it yet.
   *
   * <p>The page is fetched with Jsoup, skipped when its title contains "404", and otherwise
   * handed to {@code parser}. When {@code url} is the company's base URL, every link returned by
   * {@code getLinks} that the database has not seen is crawled in its own thread, with a pause
   * between launches.
   *
   * <p>Exceptions raised while crawling are logged at WARNING level and do not propagate.
   */
  public void run() {
    // A single missing collaborator is enough to make the crawl impossible.
    if (infosMotCles == null || infosSociete == null || maBD == null) {
      return;
    }

    try {
      String baseUrl = infosSociete.get("url").toString();
      // Path of this page relative to the company's base URL.
      String pageCourante = url.substring(baseUrl.length());

      // estDejaVu answering "0" is treated as "page not seen yet"; anything else means skip.
      if (!maBD.estDejaVu(baseUrl, pageCourante).equalsIgnoreCase("0")) {
        return;
      }

      logger.log(Level.INFO, "Parsing - {0}", url);
      Document doc = Jsoup.connect(url).get();

      // Skip 404 pages.
      if (doc.title().contains("404")) {
        return;
      }

      parser(doc, pageCourante);

      // Only the base page fans out to the additional pages it links to.
      if (!baseUrl.equalsIgnoreCase(url)) {
        return;
      }

      for (String link : getLinks(doc)) {
        // A link shorter than the base URL has no relative part; skip it rather than let
        // substring() throw and abort the remaining links.
        if (link == null || link.length() < baseUrl.length()) {
          continue;
        }

        String pageLiee = link.substring(baseUrl.length());
        if (!maBD.estDejaVu(baseUrl, pageLiee).equalsIgnoreCase("0")) {
          continue;
        }

        // Crawl the linked page in its own thread.
        new Thread(new Parser(infosSociete, infosMotCles, maBD, link, logger)).start();

        try {
          Thread.sleep(3500); // Wait 3.5 s before launching the next crawl.
        } catch (InterruptedException interrupted) {
          // Keep the interrupt visible to the caller and stop launching further crawls.
          Thread.currentThread().interrupt();
          return;
        }
      }
    } catch (Exception ex) {
      logger.log(Level.WARNING, "Parsing error - {0}", url + " - " + ex.toString());
    }
  }
コード例 #2
0
ファイル: Parser.java プロジェクト: chenyang/Project_Euronext
  /**
   * Searches {@code doc} for every keyword in {@code infosMotCles} and records each occurrence
   * through {@code maBD.insert}.
   *
   * <p>For each keyword (entry {@code "libelleMotCle"}, lower-cased and trimmed) the following
   * places are checked, in this order:
   *
   * <ol>
   *   <li>the {@code keywords} meta tag (position {@code "keywords"}),
   *   <li>the {@code description} meta tag (position {@code "description"}),
   *   <li>the own text of every element (position is the tag name for h1-h4, li, div, span, p
   *       and a, otherwise {@code "autre"}),
   *   <li>the {@code alt} attribute of images (position {@code "img"}),
   *   <li>the {@code title} attribute of links (position {@code "a"}).
   * </ol>
   *
   * <p>Matching is case-insensitive everywhere. The text stored with a hit is the text in which
   * the keyword was found (meta content, element own text, {@code alt} or {@code title} value).
   *
   * @param doc the parsed page to inspect
   * @param pageCourante the page identifier passed through to {@code maBD.insert}
   * @throws Exception if reading a keyword entry or recording a hit fails
   */
  private void parser(Document doc, String pageCourante) throws Exception {
    // Page-level data does not depend on the keyword, so it is read once up front.
    String keywords = doc.getElementsByAttributeValue("name", "keywords").attr("content");
    String description = doc.getElementsByAttributeValue("name", "description").attr("content");
    // The keyword is lower-cased, so the text it is searched in must be lower-cased as well.
    String keywordsLower = keywords.toLowerCase(java.util.Locale.ROOT);
    String descriptionLower = description.toLowerCase(java.util.Locale.ROOT);
    Elements images = doc.getElementsByTag("img");
    Elements anchors = doc.getElementsByTag("a");

    for (Map<?, ?> unMot : infosMotCles) {
      String motCle =
          unMot.get("libelleMotCle").toString().toLowerCase(java.util.Locale.ROOT).trim();
      // A blank keyword is contained in every string and would flag the whole page.
      if (motCle.isEmpty()) {
        continue;
      }

      // Header inspection: meta keywords, then meta description.
      if (keywordsLower.contains(motCle)) {
        recordMatch(unMot, pageCourante, "keywords", keywords);
      }
      if (descriptionLower.contains(motCle)) {
        recordMatch(unMot, pageCourante, "description", description);
      }

      // Elements whose own text contains the keyword.
      for (Element elt : doc.getElementsContainingOwnText(motCle)) {
        recordMatch(unMot, pageCourante, positionForTag(elt.tagName()), elt.ownText());
      }

      // Image alt texts; the alt value itself is stored since <img> has no own text.
      for (Element image : images) {
        String alt = image.attr("alt");
        if (alt.toLowerCase(java.util.Locale.ROOT).contains(motCle)) {
          recordMatch(unMot, pageCourante, "img", alt);
        }
      }

      // Link titles; the title value is stored because that is where the keyword was found.
      for (Element anchor : anchors) {
        String title = anchor.attr("title");
        if (title.toLowerCase(java.util.Locale.ROOT).contains(motCle)) {
          recordMatch(unMot, pageCourante, "a", title);
        }
      }
    }
  }

  /**
   * Maps an element tag name to the position label stored with a hit: the lower-case tag name
   * for the tags tracked individually, {@code "autre"} for everything else.
   */
  private String positionForTag(String tagName) {
    String[] trackedTags = {"h1", "h2", "h3", "h4", "li", "div", "span", "p", "a"};
    for (String tracked : trackedTags) {
      if (tracked.equalsIgnoreCase(tagName)) {
        return tracked;
      }
    }
    return "autre";
  }

  /**
   * Records one keyword hit via {@code maBD.insert}, using the company's {@code "url"} entry and
   * the keyword's {@code "idMotCle"} entry.
   */
  private void recordMatch(Map<?, ?> unMot, String pageCourante, String pos, String texte)
      throws Exception {
    maBD.insert(
        infosSociete.get("url").toString(),
        pageCourante,
        unMot.get("idMotCle").toString(),
        pos,
        texte);
  }