コード例 #1
0
  /**
   * Parses the locally saved question pages of a keyword and writes each question and each of its
   * answers as one row of the (unfiltered) {@code fragment} table.
   *
   * <p>Page {@code j} is read from {@code catalog + keyword + j + ".html"}, where {@code catalog}
   * is whatever {@code KeywordCatalogDesign.GetKeywordCatalog(keyword)} returns. A page is skipped
   * (with a message on stdout) when its file does not exist or when
   * {@code DataCollection.getQuestionURLs(keyword)} returned no URL for index {@code j}.
   *
   * <p>Row ids are built from the keyword with every {@code +} replaced by {@code _}: the question
   * of page {@code j} gets id {@code <keyword>j}, its {@code m}-th answer gets
   * {@code <keyword>j_m}. The crawl-time column holds the time at which the page was parsed, not
   * the time it was downloaded.
   *
   * @param keyword search keyword used to locate the saved pages and to build row ids
   * @param pagelength number of page indices to try, starting at 0
   * @throws Exception declared for callers, but every {@link Exception} raised inside the method
   *     is caught, printed to stdout as {@code "Error : ..."} and not rethrown; processing stops
   *     at the first such failure
   */
  public void fragment(String keyword, int pagelength) throws Exception {
    try {
      String catalog = KeywordCatalogDesign.GetKeywordCatalog(keyword);

      // Links of the question pages; index j is expected to correspond to saved page j.
      String[] questionUrls = DataCollection.getQuestionURLs(keyword);
      int urlCount = (questionUrls == null) ? 0 : questionUrls.length;

      // Loop-invariant values, built once instead of once per page.
      String idPrefix = keyword.replace("+", "_");
      SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
      String sourceType = "Quora"; // origin of the fragment
      String authorId = "0"; // no author id is available from the saved pages
      String mediaType = "text"; // fragments are plain text
      String evaluation = "1"; // "usable" flag, defaults to 1

      for (int j = 0; j < pagelength; j++) {
        String path = catalog + keyword + j + ".html";
        if (!new File(path).exists()) {
          System.out.println(path + "  不存在,请重新爬取数据...");
          continue;
        }
        if (j >= urlCount) {
          // Without this guard the array access below would throw and the catch-all would abort
          // the whole run with an opaque message.
          System.out.println(
              path + "  has no matching question URL (" + urlCount + " URLs available), skipping...");
          continue;
        }

        System.out.println("\n开始解析: " + path);
        Document doc = JsoupParse.parsePathText(path);

        String questionId = idPrefix + j;
        String url = questionUrls[j];
        String questionContent =
            FeatureExtraction.questionContent(doc)
                + "\n"
                + "Expanded information:"
                + FeatureExtraction.questionExpandInfo(doc);
        // The crawl time is the moment this page is parsed.
        String crawlerTime = timestampFormat.format(new Date());
        System.out.println("CrawlerTime1 is :" + crawlerTime);

        saveFragmentRow(
            new Object[] {
              questionId,
              sourceType,
              url,
              questionContent,
              crawlerTime,
              authorId,
              mediaType,
              evaluation
            });

        // Answers share the question's URL, crawl time and constant columns.
        int answerCount = FeatureExtraction.countRealAnswerNumber(doc);
        for (int m = 0; m < answerCount; m++) {
          String answerId = questionId + "_" + m;
          String answerContent = FeatureExtraction.answerContent(doc, m);
          saveFragmentRow(
              new Object[] {
                answerId,
                sourceType,
                url,
                answerContent,
                crawlerTime,
                authorId,
                mediaType,
                evaluation
              });
        }
      }
    } catch (Exception ex) {
      // Best-effort: report the failure and return normally.
      System.out.println("Error : " + ex.toString());
    }
  }

  /** Writes one eight-column row into {@code fragment} and then closes the helper connection. */
  private void saveFragmentRow(Object[] row) throws Exception {
    sql = "replace into fragment values (?, ?, ?, ?, ?, ?, ?, ?)";
    mysqlCon.doSql(sql, row);
    mysqlCon.getClose();
  }
コード例 #2
0
  /**
   * Fills the {@code fragment_term} table, which links fragment ids (questions and answers) to
   * the id of the term they belong to.
   *
   * <p>The {@code term} table is read once over a direct JDBC connection; its columns are read
   * positionally (column 1 as the term id, column 2 as the term name). For every term whose name
   * equals {@code keyword} with each {@code +} replaced by {@code _}, and for every saved page
   * {@code catalog + keyword + j + ".html"} that exists, one row is written for the question id
   * {@code <keyword>j} and one for each answer id {@code <keyword>j_m}. Pages whose file is
   * missing are reported on stdout and skipped.
   *
   * @param keyword search keyword used to locate the saved pages and to build fragment ids
   * @param pagelength number of page indices to try, starting at 0
   * @throws Exception if the JDBC driver cannot be loaded, the {@code term} query fails, or any
   *     of the parsing/insert helpers throws
   */
  public void fragmentTerm(String keyword, int pagelength) throws Exception {
    Class.forName("com.mysql.jdbc.Driver").newInstance();

    String keywordchange = keyword.replace("+", "_");

    // Single pass over the term table: count the rows and remember the ids of matching terms.
    java.util.List<Integer> matchingTermIds = new java.util.ArrayList<>();
    int termCount = 0;
    // NOTE(review): connection URL and credentials are hard-coded here; consider moving them to
    // configuration. Other targets that were previously used (disabled in the original code):
    // 202.117.16.39:3306/knowledgeforest, 202.117.16.39:3306/knowledgeforestlocal,
    // 202.117.54.43:3306/knowledgeforest.
    try (Connection conn =
            DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/knowledgeforest", "root", "199306");
        PreparedStatement ps = conn.prepareStatement("select * from term");
        ResultSet rs = ps.executeQuery()) {
      while (rs.next()) {
        termCount++;
        int termId = rs.getInt(1);
        String termName = rs.getString(2);
        if (keywordchange.equals(termName)) {
          matchingTermIds.add(termId);
        }
      }
    }
    System.out.println("共有term数目为:" + termCount);

    for (int termId : matchingTermIds) {
      String catalog = KeywordCatalogDesign.GetKeywordCatalog(keyword);
      for (int j = 0; j < pagelength; j++) {
        String path = catalog + keyword + j + ".html";
        if (!new File(path).exists()) {
          System.out.println(path + "  不存在,请重新爬取数据...");
          // Nothing to parse for this page, so there are no answers to link either.
          continue;
        }
        Document doc = JsoupParse.parsePathText(path);

        String questionId = keywordchange + j;
        linkFragmentToTerm(questionId, termId);

        int answerCount = FeatureExtraction.countRealAnswerNumber(doc);
        for (int m = 0; m < answerCount; m++) {
          linkFragmentToTerm(questionId + "_" + m, termId);
        }
      }
    }
  }

  /** Writes one (fragment id, term id) row into {@code fragment_term} and closes the helper. */
  private void linkFragmentToTerm(String fragmentId, int termId) throws Exception {
    sql = "replace into fragment_term values (?, ?)";
    mysqlCon.doSql(sql, new Object[] {fragmentId, termId});
    mysqlCon.getClose();
  }