コード例 #1
0
  /**
   * Fills the {@code domain_topic} table with one row per entry of a course directory.
   *
   * <p>Every entry found directly under {@code file/datacollection/<course>} becomes a topic: the
   * topic id is the entry's 1-based position in the directory listing and the topic name is the
   * entry's file name, URL-decoded as UTF-8. The original author's note says the table holds the 54
   * crawled data-structure topics and that this method is currently unused.
   *
   * @param course name of the course directory below {@code file/datacollection/}
   * @throws IllegalArgumentException if the course directory does not exist or cannot be listed
   * @throws Exception if decoding a file name or executing the insert fails
   */
  public void topic(String course) throws Exception {
    File courseDir = new File("file/datacollection/" + course);
    File[] files = courseDir.listFiles();
    if (files == null) {
      // listFiles() returns null when the path is not a directory or an I/O error occurs.
      throw new IllegalArgumentException("Cannot list course directory: " + courseDir.getPath());
    }

    // NOTE(review): File.listFiles() gives no ordering guarantee, so the ids assigned below follow
    // whatever order the platform returns the entries in.
    for (int i = 0; i < files.length; i++) {
      int topic_id = i + 1;
      String topic_name = URLDecoder.decode(files[i].getName(), "UTF-8");

      // Insert the (id, name) pair through the shared MySQL helper.
      sql = "insert into domain_topic values (?, ?)";
      Object[] insertParams = new Object[] {topic_id, topic_name};
      mysqlCon.doSql(sql, insertParams);

      // The update count gets its own variable: writing it into the loop index would make the
      // iteration jump to an unrelated file (or never terminate).
      int updateCount = mysqlCon.getUpdateCount();
      if (updateCount != -1) {
        System.out.println("数据插入成功!");
      } else {
        System.out.println("数据插入失败!");
      }
      // Release the helper's connection after each statement.
      mysqlCon.getClose();

      System.out.println(topic_id + "   " + topic_name);
    }
  }
コード例 #2
0
  /**
   * Parses the saved question pages of a keyword and writes them to the {@code fragment} table
   * (the fragments before filtering).
   *
   * <p>For every page index in {@code [0, pagelength)} the file {@code <catalog><keyword><index>.html}
   * is parsed; one row is written for the question and one for each answer counted on the page.
   * No author id is available, and the crawl time recorded is the moment the page is parsed.
   *
   * <p>Any exception raised while processing is caught, reported on standard output and not
   * rethrown, so processing stops at the first failing page.
   *
   * @param keyword crawl keyword; {@code +} characters are replaced by {@code _} in stored ids
   * @param pagelength number of saved pages to process
   * @throws Exception declared for callers; failures inside the body are caught and printed
   */
  public void fragment(String keyword, int pagelength) throws Exception {
    try {
      final String pageDir = KeywordCatalogDesign.GetKeywordCatalog(keyword);
      // Links of the question pages, indexed like the saved HTML files.
      final String[] questionLinks = DataCollection.getQuestionURLs(keyword);

      for (int page = 0; page < pagelength; page++) {
        final String pagePath = pageDir + keyword + page + ".html";
        if (!new File(pagePath).exists()) {
          System.out.println(pagePath + "  不存在,请重新爬取数据...");
          continue;
        }

        System.out.println("\n开始解析: " + pagePath);
        final Document pageDoc = JsoupParse.parsePathText(pagePath);

        // Column values shared by the question row and all of its answer rows.
        final String questionId = keyword.replace('+', '_') + page;
        final String source = "Quora";
        final String link = questionLinks[page];
        String questionBody = FeatureExtraction.questionContent(pageDoc);
        questionBody =
            questionBody
                + "\n"
                + "Expanded information:"
                + FeatureExtraction.questionExpandInfo(pageDoc);
        // The crawl time is simply "now", formatted for storage.
        final String crawledAt = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").format(new Date());
        System.out.println("CrawlerTime1 is :" + crawledAt);
        final String authorId = "0";
        final String mediaType = "text"; // fragments are plain text
        final String usable = "1"; // availability flag, 1 by default

        // Store the question itself.
        sql = "replace into fragment values (?, ?, ?, ?, ?, ?, ?, ?)";
        mysqlCon.doSql(
            sql,
            new Object[] {
              questionId, source, link, questionBody, crawledAt, authorId, mediaType, usable
            });
        mysqlCon.getClose();

        // Store every answer; answer ids are "<questionId>_<answerIndex>".
        final int answerCount = FeatureExtraction.countRealAnswerNumber(pageDoc);
        int answerIndex = 0;
        while (answerIndex < answerCount) {
          final String answerId = questionId + "_" + answerIndex;
          final String answerBody = FeatureExtraction.answerContent(pageDoc, answerIndex);
          mysqlCon.doSql(
              sql,
              new Object[] {
                answerId, source, link, answerBody, crawledAt, authorId, mediaType, usable
              });
          mysqlCon.getClose();
          answerIndex++;
        }
      }
    } catch (Exception ex) {
      System.out.println("Error : " + ex.toString());
    }
  }
コード例 #3
0
  /**
   * Fills the {@code fragment_term} table, linking each question and answer fragment of a keyword
   * to the matching row of the {@code term} table.
   *
   * <p>The keyword, with {@code +} replaced by {@code _}, is compared with the second column of
   * every {@code term} row (the first column is read as the term id). For each matching term, the
   * saved pages {@code 0..pagelength-1} are parsed and one row is written for the question
   * ({@code <keyword><page>}) and one for each answer ({@code <keyword><page>_<answer>}). Pages
   * whose HTML file is missing are reported and skipped without being parsed.
   *
   * @param keyword crawl keyword; words are separated by {@code +}
   * @param pagelength number of saved pages to process
   * @throws Exception if the JDBC driver cannot be loaded, the {@code term} query fails, or parsing
   *     or inserting a fragment fails
   */
  public void fragmentTerm(String keyword, int pagelength) throws Exception {
    Class.forName("com.mysql.jdbc.Driver").newInstance();

    String keywordchange = keyword.replaceAll("\\+", "\\_");

    // Read the term table once, counting rows and remembering the ids whose name equals the
    // keyword. try-with-resources closes the result set, statement and connection on every path.
    // NOTE(review): the connection URL and credentials are hard-coded; moving them to
    // configuration would keep secrets out of the source.
    java.util.List<Integer> matchingTermIds = new java.util.ArrayList<Integer>();
    int termCount = 0;
    try (Connection conn =
            DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/knowledgeforest", "root", "199306");
        PreparedStatement ps = conn.prepareStatement("select * from term");
        ResultSet rs = ps.executeQuery()) {
      while (rs.next()) {
        termCount++;
        int term_id = rs.getInt(1);
        String domterm_name = rs.getString(2);
        if (keywordchange.equals(domterm_name)) {
          matchingTermIds.add(term_id);
        }
      }
    }
    System.out.println("共有term数目为:" + termCount);

    for (int term_id : matchingTermIds) {
      String catalog = KeywordCatalogDesign.GetKeywordCatalog(keyword);
      for (int j = 0; j < pagelength; j++) {
        String path = catalog + keyword + j + ".html";
        // Check for the file before parsing so a missing page is skipped, not parsed.
        if (!new File(path).exists()) {
          System.out.println(path + "  不存在,请重新爬取数据...");
          continue;
        }
        Document doc = JsoupParse.parsePathText(path);

        // Link the question fragment to the term.
        String QuestionId = keywordchange + j;
        sql = "replace into fragment_term values (?, ?)";
        mysqlCon.doSql(sql, new Object[] {QuestionId, term_id});
        mysqlCon.getClose();

        // Link every answer fragment of this page to the same term.
        int realanswernumber = FeatureExtraction.countRealAnswerNumber(doc);
        for (int m = 0; m < realanswernumber; m++) {
          String AnswerId = QuestionId + "_" + m;
          sql = "replace into fragment_term values (?, ?)";
          mysqlCon.doSql(sql, new Object[] {AnswerId, term_id});
          mysqlCon.getClose();
        }
      }
    }
  }
コード例 #4
0
  /**
   * Reads the fragment-to-facet assignments from the keyword's {@code -tag_changed.xls} sheet and
   * writes them to the {@code assemble} table.
   *
   * <p>Only rows whose column 12 contains {@code "1"} are used. For such a row, column 0 supplies
   * the fragment id (with {@code +} replaced by {@code _}) and columns 14, 15, 16 and 13 supply
   * facet tags, checked in that order; each tag equal to a known facet name produces one row with
   * that facet's 1-based index as the facet id.
   *
   * <p>Any exception raised while processing is caught, reported on standard output and not
   * rethrown. The workbook is closed whether or not processing succeeds.
   *
   * @param keyword crawl keyword used to locate the spreadsheet
   * @throws Exception declared for callers; failures inside the body are caught and printed
   */
  public void assemble(String keyword) throws Exception {
    Workbook book = null;
    try {
      String catalog = KeywordCatalogDesign.GetKeywordCatalog(keyword);
      String path = catalog + keyword + "-tag_changed.xls";
      File file = new File(path);
      if (!file.exists()) {
        System.out.println(path + "  不存在,请重新生成解析表格...");
        return;
      }

      System.out.println("\n开始对应: " + path);
      book = Workbook.getWorkbook(file);
      Sheet sheet = book.getSheet(0);
      int row = sheet.getRows();
      // Facet names in id order: the facet id is the array index plus one.
      String[] facetname = {
        "definition",
        "feature",
        "implementation",
        "example",
        "operation",
        "application",
        "method",
        "type",
        "relevant",
        "history",
        "description",
        "purpose",
        "explanation",
        "storage",
        "simulator"
      };
      for (int i = 0; i < row; i++) {
        // Column 12 marks whether the row is used at all.
        if (!sheet.getCell(12, i).getContents().equals("1")) {
          continue;
        }
        String fragmentid = sheet.getCell(0, i).getContents().replaceAll("\\+", "\\_");
        // Read the tag cells in column order, then arrange them in the order they are checked.
        String tag13 = sheet.getCell(13, i).getContents();
        String tag14 = sheet.getCell(14, i).getContents();
        String tag15 = sheet.getCell(15, i).getContents();
        String tag16 = sheet.getCell(16, i).getContents();
        String[] tag = {tag14, tag15, tag16, tag13};
        for (int m = 0; m < tag.length; m++) {
          for (int n = 0; n < facetname.length; n++) {
            if (!tag[m].equals(facetname[n])) {
              continue;
            }
            int facetid = n + 1;

            // NOTE(review): 52 is presumably the term id of Binary_tree (the original note says
            // this method targets that topic) and 1 a fixed flag - confirm against the schema.
            sql = "replace into assemble values (?, ?, ?, ?, ?)";
            Object[] answerobject = new Object[] {52, facetid, 1, fragmentid, " "};
            mysqlCon.doSql(sql, answerobject);
            mysqlCon.getClose();

            System.out.println(fragmentid + "   " + facetid + "  52");
          }
        }
      }
    } catch (Exception ex) {
      System.out.println("Error : " + ex.toString());
    } finally {
      // Release the workbook on both the success and the failure path.
      if (book != null) {
        book.close();
      }
    }
  }