/** * 填写主题词表(自己爬取的数据结构的54个主题词) * * <p>(暂时没有用处) * * @throws Exception */ public void topic(String course) throws Exception { // 填表 File file0 = new File("file/datacollection/" + course); // File file0 = new File("f:/术语/课程术语/" + course); File[] files = file0.listFiles(); for (int i = 0; i < files.length; i++) { int topic_id = i + 1; String topic_name = files[i].getName(); topic_name = URLDecoder.decode(topic_name, "UTF-8"); // 向MYSQL里面添加记录 // 创建sql语句 sql = "insert into domain_topic values (?, ?)"; // 创建object数组 Object[] answerobject = new Object[] {topic_id, topic_name}; // 执行sql语句 mysqlCon.doSql(sql, answerobject); // 获取影响行数 i = mysqlCon.getUpdateCount(); // 判断是否插入成功 if (i != -1) { System.out.println("数据插入成功!"); } else { System.out.println("数据插入失败!"); } // 关闭链接 mysqlCon.getClose(); System.out.println(topic_id + " " + topic_name); } }
/** 解析问题网页,将其保存到数据库,填写fragment表(过滤前的) */ public void fragment(String keyword, int pagelength) throws Exception { try { // 解析数据,数据加到数据库里面 String catalog = KeywordCatalogDesign.GetKeywordCatalog(keyword); // 得到问题网页的链接 String[] QuestionUrls = DataCollection.getQuestionURLs(keyword); // System.out.println("链接数目为:" + pagelength); // 解析每个问题网页的问题和回答 for (int j = 0; j < pagelength; j++) { String path = catalog + keyword + j + ".html"; File file = new File(path); if (!file.exists()) { System.out.println(path + " 不存在,请重新爬取数据..."); } else { System.out.println("\n开始解析: " + path); Document doc = JsoupParse.parsePathText(path); // 得到问题的各字段信息,没有作者ID,爬取时间为当前解析数据的时间 String keywordstore = keyword.replaceAll("\\+", "\\_"); // System.out.println("转义以后:" + keywordstore); String QuestionId = keywordstore + j + ""; // 得到问题ID String SourceType = "Quora"; // 得到问题来源 String URL = QuestionUrls[j]; // 得到碎片链接URL // System.out.println("url为:" + URL); String QuestionContent = FeatureExtraction.questionContent(doc) + "\n" + "Expanded information:" + FeatureExtraction.questionExpandInfo(doc); // 得到碎片内容 SimpleDateFormat df = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); // 设置日期格式 String CrawlerTime = df.format(new Date()); // new // Date()为获取当前系统时间 System.out.println("CrawlerTime1 is :" + CrawlerTime); String AuthorID = "0"; String media_type = "text"; // 数据是文本类型的text String evaluation = "1"; // 数据是否可用,默认都是1 // 创建sql语句 sql = "replace into fragment values (?, ?, ?, ?, ?, ?, ?, ?)"; // 创建object数组 Object[] questionobject = new Object[] { QuestionId, SourceType, URL, QuestionContent, CrawlerTime, AuthorID, media_type, evaluation }; // 执行sql语句 mysqlCon.doSql(sql, questionobject); // 获取影响行数 // i = mysqlCon.getUpdateCount(); // // 判断是否插入成功 // if (i != -1) { // System.out.println("数据插入成功!"); // } else { // System.out.println("数据插入失败!"); // } // 关闭链接 mysqlCon.getClose(); // 得到答案数据 int realanswernumber = FeatureExtraction.countRealAnswerNumber(doc); for (int m = 0; m < realanswernumber; m++) { // 得到答案的各字段信息,没有作者ID,爬取时间为当前解析数据的时间 String AnswerId = QuestionId + "_" + m; // 得到答案ID String AnswerContent = FeatureExtraction.answerContent(doc, m); ; // 得到碎片内容 // 创建object数组 Object[] answerobject = new Object[] { AnswerId, SourceType, URL, AnswerContent, CrawlerTime, AuthorID, media_type, evaluation }; mysqlCon.doSql(sql, answerobject); // i = mysqlCon.getUpdateCount(); // if (i != -1) { // System.out.println("数据插入成功!"); // } else { // System.out.println("数据插入失败!"); // } // 关闭链接 mysqlCon.getClose(); } } } } catch (Exception ex) { System.out.println("Error : " + ex.toString()); } }
/** * 填写fragment_term表(填写fragment和以前的主题词的关系表) 读取term表 * * @throws Exception */ public void fragmentTerm(String keyword, int pagelength) throws Exception { Connection conn; PreparedStatement ps; Class.forName("com.mysql.jdbc.Driver").newInstance(); // 建立到MySQL的连接(注意数据库IP) conn = DriverManager.getConnection( "jdbc:mysql://localhost:3306/knowledgeforest", "root", "199306"); // conn = DriverManager.getConnection("jdbc:mysql://202.117.16.39:3306/knowledgeforest", // "e-learning","knowledgeforest"); // conn = DriverManager.getConnection("jdbc:mysql://202.117.16.39:3306/knowledgeforestlocal", // "e-learning","knowledgeforest"); // conn = DriverManager.getConnection("jdbc:mysql://202.117.54.43:3306/knowledgeforest", // "e-learning","knowledgeforest"); // 查询term表,得到关键词表 ps = conn.prepareStatement("select * from term"); ResultSet rs = ps.executeQuery(); int length = 0; while (rs.next()) { length++; // 得到term总数 } System.out.println("共有term数目为:" + length); // 得到term表的主题词名和对应id String[] topic = new String[length]; int[] term = new int[length]; ResultSet rs1 = ps.executeQuery(); int index = 0; while (rs1.next()) { int term_id = rs1.getInt(1); String domterm_name = rs1.getString(2); topic[index] = domterm_name; term[index] = term_id; index++; } // 比较关键词和主题词,判断关键词是否在term表中是否存在 String keywordchange = keyword.replaceAll("\\+", "\\_"); for (int i = 0; i < topic.length; i++) { if (keywordchange.equals(topic[i])) { String catalog = KeywordCatalogDesign.GetKeywordCatalog(keyword); for (int j = 0; j < pagelength; j++) { String path = catalog + keyword + j + ".html"; File file = new File(path); Document doc = JsoupParse.parsePathText(path); if (!file.exists()) { System.out.println(path + " 不存在,请重新爬取数据..."); } else { String QuestionId = keywordchange + j + ""; // 得到问题ID int term_id = term[i]; // System.out.println(QuestionId + " " + term_id); // 创建sql语句 sql = "replace into fragment_term values (?, ?)"; Object[] answerobject = new Object[] {QuestionId, term_id}; mysqlCon.doSql(sql, answerobject); // i = mysqlCon.getUpdateCount(); // if (i != -1) { // System.out.println("数据插入成功!"); // } else { // System.out.println("数据插入失败!"); // } // 关闭链接 mysqlCon.getClose(); } int realanswernumber = FeatureExtraction.countRealAnswerNumber(doc); for (int m = 0; m < realanswernumber; m++) { String AnswerId = keywordchange + j + "_" + m; // 得到答案ID int term_id = term[i]; // System.out.println(AnswerId + " " + term_id); // 创建sql语句 sql = "replace into fragment_term values (?, ?)"; Object[] answerobject = new Object[] {AnswerId, term_id}; mysqlCon.doSql(sql, answerobject); // i = mysqlCon.getUpdateCount(); // if (i != -1) { // System.out.println("数据插入成功!"); // } else { // System.out.println("数据插入失败!"); // } // 关闭链接 mysqlCon.getClose(); } } } } }
/** 解析碎片与分面之间的关系,填写assemble表(表格变化) (针对二叉树Binary_tree) */ public void assemble(String keyword) throws Exception { try { // 解析数据,数据加到数据库里面 String catalog = KeywordCatalogDesign.GetKeywordCatalog(keyword); String path = catalog + keyword + "-tag_changed.xls"; File file = new File(path); if (!file.exists()) { System.out.println(path + " 不存在,请重新生成解析表格..."); } else { System.out.println("\n开始对应: " + path); Workbook book = Workbook.getWorkbook(file); Sheet sheet = book.getSheet(0); int row = sheet.getRows(); String[] facetname = { "definition", "feature", "implementation", "example", "operation", "application", "method", "type", "relevant", "history", "description", "purpose", "explanation", "storage", "simulator" }; for (int i = 0; i < row; i++) { Cell cell0 = sheet.getCell(0, i); Cell cell12 = sheet.getCell(12, i); if (cell12.getContents().equals("1")) { String fragmentid = cell0.getContents().replaceAll("\\+", "\\_"); int facetid; Cell cell13 = sheet.getCell(13, i); Cell cell14 = sheet.getCell(14, i); Cell cell15 = sheet.getCell(15, i); Cell cell16 = sheet.getCell(16, i); String[] tag = { cell14.getContents(), cell15.getContents(), cell16.getContents(), cell13.getContents() }; for (int m = 0; m < tag.length; m++) { for (int n = 0; n < facetname.length; n++) { if (tag[m].equals(facetname[n])) { facetid = n + 1; sql = "replace into assemble values (?, ?, ?, ?, ?)"; Object[] answerobject = new Object[] {52, facetid, 1, fragmentid, " "}; mysqlCon.doSql(sql, answerobject); // i = mysqlCon.getUpdateCount(); // if (i != -1) { // System.out.println("数据插入成功!"); // } else { // System.out.println("数据插入失败!"); // } // 关闭链接 mysqlCon.getClose(); System.out.println(fragmentid + " " + facetid + " 52"); } } } } } } } catch (Exception ex) { System.out.println("Error : " + ex.toString()); } }