/** Parse the saved question pages and fill the fragment table (pre-filtering) with them. */
public void fragment(String keyword, int pagelength) throws Exception {
    try {
        // Directory in which the pages for this keyword were saved.
        String catalog = KeywordCatalogDesign.GetKeywordCatalog(keyword);
        // URLs of the question pages.
        String[] QuestionUrls = DataCollection.getQuestionURLs(keyword);
        // Parse the question and the answers of every saved page.
        for (int j = 0; j < pagelength; j++) {
            String path = catalog + keyword + j + ".html";
            File file = new File(path);
            if (!file.exists()) {
                System.out.println(path + " does not exist, please re-crawl the data...");
            } else {
                System.out.println("\nStart parsing: " + path);
                Document doc = JsoupParse.parsePathText(path);
                // Question fields; there is no author ID, and the crawl time is the moment the page is parsed.
                String keywordstore = keyword.replaceAll("\\+", "_");
                String QuestionId = keywordstore + j;   // question ID
                String SourceType = "Quora";            // source of the question
                String URL = QuestionUrls[j];           // URL of the fragment
                String QuestionContent = FeatureExtraction.questionContent(doc) + "\n"
                        + "Expanded information:" + FeatureExtraction.questionExpandInfo(doc); // fragment content
                SimpleDateFormat df = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); // date format
                String CrawlerTime = df.format(new Date()); // current system time
                System.out.println("CrawlerTime is: " + CrawlerTime);
                String AuthorID = "0";
                String media_type = "text"; // the data is plain text
                String evaluation = "1";    // whether the data is usable, 1 by default

                // Insert (or overwrite) the question fragment.
                sql = "replace into fragment values (?, ?, ?, ?, ?, ?, ?, ?)";
                Object[] questionobject = new Object[] { QuestionId, SourceType, URL, QuestionContent,
                        CrawlerTime, AuthorID, media_type, evaluation };
                mysqlCon.doSql(sql, questionobject);
                // Optionally check mysqlCon.getUpdateCount() here to verify the insert succeeded.
                mysqlCon.getClose();

                // Insert one fragment per answer.
                int realanswernumber = FeatureExtraction.countRealAnswerNumber(doc);
                for (int m = 0; m < realanswernumber; m++) {
                    String AnswerId = QuestionId + "_" + m;                         // answer ID
                    String AnswerContent = FeatureExtraction.answerContent(doc, m); // fragment content
                    Object[] answerobject = new Object[] { AnswerId, SourceType, URL, AnswerContent,
                            CrawlerTime, AuthorID, media_type, evaluation };
                    mysqlCon.doSql(sql, answerobject);
                    mysqlCon.getClose();
                }
            }
        }
    } catch (Exception ex) {
        System.out.println("Error: " + ex.toString());
    }
}
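/*
 * The MysqlControl helper behind mysqlCon is not shown in this class. A minimal
 * sketch of what its doSql(sql, params) call is assumed to do -- bind the Object[]
 * parameters to a PreparedStatement and execute it -- might look like the method
 * below. The method name, JDBC URL and credentials here are illustrative
 * assumptions, not the project's actual implementation.
 */
private static void doSqlSketch(String sql, Object[] params) throws Exception {
    // Assumed connection settings; the real values live inside MysqlControl.
    try (Connection c = DriverManager.getConnection(
                 "jdbc:mysql://localhost:3306/knowledgeforest", "root", "199306");
         PreparedStatement stmt = c.prepareStatement(sql)) {
        // JDBC parameters are 1-indexed.
        for (int k = 0; k < params.length; k++) {
            stmt.setObject(k + 1, params[k]);
        }
        stmt.executeUpdate();
    }
}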
/**
 * Fill the fragment_term table (the relation between fragments and the existing topic terms)
 * by reading the term table.
 *
 * @throws Exception
 */
public void fragmentTerm(String keyword, int pagelength) throws Exception {
    Class.forName("com.mysql.jdbc.Driver").newInstance();
    // Connect to MySQL (mind the database host).
    Connection conn = DriverManager.getConnection(
            "jdbc:mysql://localhost:3306/knowledgeforest", "root", "199306");
    // conn = DriverManager.getConnection("jdbc:mysql://202.117.16.39:3306/knowledgeforest", "e-learning", "knowledgeforest");
    // conn = DriverManager.getConnection("jdbc:mysql://202.117.16.39:3306/knowledgeforestlocal", "e-learning", "knowledgeforest");
    // conn = DriverManager.getConnection("jdbc:mysql://202.117.54.43:3306/knowledgeforest", "e-learning", "knowledgeforest");

    // Query the term table to get the topic terms.
    PreparedStatement ps = conn.prepareStatement("select * from term");
    ResultSet rs = ps.executeQuery();
    int length = 0;
    while (rs.next()) {
        length++; // total number of terms
    }
    System.out.println("Total number of terms: " + length);

    // Read the term names and their corresponding ids.
    String[] topic = new String[length];
    int[] term = new int[length];
    ResultSet rs1 = ps.executeQuery();
    int index = 0;
    while (rs1.next()) {
        int term_id = rs1.getInt(1);
        String domterm_name = rs1.getString(2);
        topic[index] = domterm_name;
        term[index] = term_id;
        index++;
    }

    // Compare the keyword with the topic terms to see whether it exists in the term table.
    String keywordchange = keyword.replaceAll("\\+", "_");
    for (int i = 0; i < topic.length; i++) {
        if (keywordchange.equals(topic[i])) {
            String catalog = KeywordCatalogDesign.GetKeywordCatalog(keyword);
            for (int j = 0; j < pagelength; j++) {
                String path = catalog + keyword + j + ".html";
                File file = new File(path);
                if (!file.exists()) {
                    System.out.println(path + " does not exist, please re-crawl the data...");
                    continue; // nothing to link for a missing page
                }
                Document doc = JsoupParse.parsePathText(path);
                int term_id = term[i];

                // Link the question fragment to the term.
                String QuestionId = keywordchange + j; // question ID
                sql = "replace into fragment_term values (?, ?)";
                Object[] questionobject = new Object[] { QuestionId, term_id };
                mysqlCon.doSql(sql, questionobject);
                // Optionally check mysqlCon.getUpdateCount() here to verify the insert succeeded.
                mysqlCon.getClose();

                // Link every answer fragment to the term.
                int realanswernumber = FeatureExtraction.countRealAnswerNumber(doc);
                for (int m = 0; m < realanswernumber; m++) {
                    String AnswerId = keywordchange + j + "_" + m; // answer ID
                    Object[] answerobject = new Object[] { AnswerId, term_id };
                    mysqlCon.doSql(sql, answerobject);
                    mysqlCon.getClose();
                }
            }
        }
    }
    rs.close();
    rs1.close();
    ps.close();
    conn.close();
}
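/*
 * Hypothetical usage: after the question pages for a keyword have been crawled
 * to disk, fill the fragment table and then the fragment_term relation. The
 * enclosing class name (QuoraExtraction), the keyword value and the page count
 * below are assumptions for illustration only.
 */
public static void main(String[] args) throws Exception {
    QuoraExtraction extraction = new QuoraExtraction(); // assumed enclosing class name
    String keyword = "data+mining"; // '+' is replaced by '_' when building fragment IDs
    int pagelength = 10;            // number of locally saved question pages
    extraction.fragment(keyword, pagelength);     // insert question/answer fragments
    extraction.fragmentTerm(keyword, pagelength); // link fragments to the matching term
}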