    /**
     * Computes the vector of an input sentence by averaging the embeddings of
     * its content words (punctuation, stop words, and out-of-vocabulary words
     * are skipped).
     *
     * @param words the segmented words of the sentence
     * @return the averaged sentence vector (all zeros if no word contributed)
     */
    private Double[] sentenceToVector(List<Word> words) {
        Double[] vector = new Double[DIMENSION];
        Arrays.fill(vector, 0.0D);
        int count = 0;
        for (Word word : words) {
            if (CommonUtil.isPunctuation(word)) {
                // Skip punctuation
                continue;
            }
            if (STOPWORDS.contains(word.getLemma())) {
                // Skip stop words
                continue;
            }
            try {
                Vector vec = this.ehCacheUtil.getMostSimilarVec(word);
                if (vec == null) {
                    // Skip words that have no entry in the word-vector model
                    continue;
                }
                Float[] floatVec = vec.floatVecs();
                for (int i = 0; i < DIMENSION; i++) {
                    vector[i] += floatVec[i];
                }
                count++;
            } catch (Exception e) {
                log.error("Get word[" + word + "] vector error!", e);
            }
        }
        if (count > 0) {
            for (int i = 0; i < DIMENSION; i++) {
                vector[i] /= count;
            }
        }
        return vector;
    }
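    /*
     * Illustration (not part of the original class): sentenceToVector above is
     * a plain average of word embeddings. A minimal, self-contained sketch of
     * that technique over primitive arrays is given below; the names
     * averageVectors and dim are assumptions made for this sketch only.
     */
    private static double[] averageVectors(List<double[]> wordVectors, int dim) {
        double[] sum = new double[dim]; // zero-initialized by the JVM
        for (double[] vec : wordVectors) {
            for (int i = 0; i < dim; i++) {
                sum[i] += vec[i];
            }
        }
        if (!wordVectors.isEmpty()) {
            // Divide by the number of contributing vectors to get the mean
            for (int i = 0; i < dim; i++) {
                sum[i] /= wordVectors.size();
            }
        }
        return sum;
    }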
    @Override
    public Boolean call() throws Exception {
        log.info("[Thread id:" + Thread.currentThread().getId() + "] is building summary for["
                + this.workDir + "/" + GlobalConstant.DIR_SENTENCES_COMPRESSION + "/" + this.filename + "]");

        // Load the sentences under the current topic, capping the count per cluster
        Map<String, ClustItem> candidateSentences = this.loadSentences(this.sentCountInClust);

        // Load the weight of each cluster
        String clusterWeightFilepath = this.workDir + "/" + GlobalConstant.DIR_CLUSTER_WEIGHT + "/"
                + this.filename.substring(0, this.filename.length() - 4) + "." + GlobalConstant.OBJ;
        log.info("Loading serialized file[" + clusterWeightFilepath + "]");
        Map<String, Float> clusterWeights = null;
        try {
            clusterWeights = (Map<String, Float>) SerializeUtil.readObj(clusterWeightFilepath);
        } catch (IOException e) {
            log.error("Load serialized file[" + clusterWeightFilepath + "] error!", e);
            throw e;
        }
        log.info("Load serialized file[" + clusterWeightFilepath + "] succeeded!");

        /*
         * While keeping the summary under the word limit, repeatedly pick the
         * candidate sentence with the best overall score (topic contribution,
         * query coverage, diversity).
         */
        // Current number of words in the summary
        int summaryWordCount = 0;
        // Current number of sentences in the summary
        int summarySentenceCount = 0;
        // Whether the candidate set still contains sentences
        boolean isNotEmpty = true;

        // Segment the question and compute its sentence vector
        List<Word> questionWords = StanfordNLPTools.segmentWord(this.question.trim());
        Double[] questionVec = this.sentenceToVector(questionWords);

        /* Partial (and eventually final) summary, organized by cluster */
        Map<String, List<Pair<Float, String>>> partialSummary = new HashMap<String, List<Pair<Float, String>>>();
        /* Sentence vectors of the partial summary */
        List<Double[]> psVectors = new ArrayList<Double[]>();
        /* Word frequencies within the partial summary */
        Map<String, Integer> wordFrequencyInPartialSummary = new HashMap<String, Integer>();
        /* Cached diversity score of each cluster in the summary */
        Map<String, Double> clusterDiversies = new HashMap<String, Double>();

        while (isNotEmpty && summaryWordCount < MAX_SUMMARY_WORDS_COUNT) {
            isNotEmpty = false;
            // Highest overall score seen in this round
            float maxGeneralScore = Float.NEGATIVE_INFINITY;
            // Name of the cluster holding the best-scoring sentence
            String selectedClustName = null;
            // The best-scoring sentence itself
            Pair<Float, String> selectedSentence = null;
            // New diversity score of the cluster the best sentence belongs to
            double selectedClustDiversityScore = -1.0D;

            for (Entry<String, ClustItem> entry : candidateSentences.entrySet()) {
                ClustItem clust = entry.getValue();
                // Current cluster name
                String currentClustKey = clust.getName();
                // Remaining candidate sentences of the current cluster
                List<Pair<Float, String>> pairs = clust.getSentences();
                if (CollectionUtils.isEmpty(pairs)) {
                    // No candidates left in this cluster
                    continue;
                }
                // There are still candidate sentences
                isNotEmpty = true;
                // Weight of the current cluster
                float currentClusterWeight = clusterWeights.get(currentClustKey);

                /* Historical diversity score (currently unused) */
                float historyDiversityScore = 0.0f;
                /*
                 * for (Entry<String, Double> innerEntry : clusterDiversies.entrySet()) {
                 *     historyDiversityScore += innerEntry.getValue();
                 * }
                 */

                // Overall score
                float generalScore = 0.0f;

                // Walk through the sentences of the current cluster
                Iterator<Pair<Float, String>> pairItr = pairs.iterator();
                while (pairItr.hasNext()) {
                    Pair<Float, String> pair = pairItr.next();

                    // 1. Topic contribution score of the current sentence
                    float topicScore = currentClusterWeight / (pair.getLeft() * clust.getSize());
                    // float topicScore = 0.001f / pair.getLeft();

                    // 2. Query coverage: similarity between the sentence and the question
                    float queryScore = 0.0f;
                    String sentence = pair.getRight();
                    List<Word> words = StanfordNLPTools.segmentWord(sentence.trim());
                    Double[] sentVec = this.sentenceToVector(words);
                    queryScore = (float) VectorOperator.cosineDistence(sentVec, questionVec);

                    // 3. Diversity score of the current sentence
                    double diversityScore = 0.0;
                    // Similarity between the sentence and the partial summary
                    double similarityScore = 0.0;
                    // true if the summary already holds a sentence with the same wording
                    boolean isSame = false;
                    for (Double[] psVec : psVectors) {
                        double sps = VectorOperator.cosineDistence(sentVec, psVec);
                        if (sps > 1.8D) {
                            isSame = true;
                            break;
                        }
                        if (sps > 0) {
                            similarityScore += sps; // reuse the similarity computed above
                        }
                    }
                    if (isSame) {
                        // The sentence duplicates one already selected; drop it
                        pairItr.remove();
                        continue;
                    }
                    if (psVectors.size() > 0) {
                        similarityScore /= psVectors.size();
                    }

                    // Combine the three signals into the overall score
                    // topicScore = (float) (this.sigmoid(topicScore) - 0.5) * 4;
                    topicScore = (float) (this.sigmoid(Math.log(topicScore + 1)) - 0.5) * 4;
                    queryScore = (float) Math.log(queryScore + 1);
                    similarityScore = Math.log(similarityScore + 1);
                    log.debug("[BEFORE: alpha=" + this.alpha + ", beta=" + this.beta + "]topic score:" + topicScore
                            + ",\tquery score:" + queryScore + ",\tsimilarity score:" + similarityScore);
                    generalScore = (float) (topicScore + this.alpha * queryScore - this.beta * similarityScore);
                    log.debug("[AFTER: alpha=" + this.alpha + ", beta=" + this.beta + "]topic score:" + topicScore
                            + ",\tquery score:" + this.alpha * queryScore
                            + ",\tsimilarity score:" + this.beta * similarityScore);

                    if (generalScore > maxGeneralScore) {
                        maxGeneralScore = generalScore;
                        selectedClustName = entry.getKey();
                        selectedSentence = pair;
                        log.info("[best in clust, alpha=" + this.alpha + ", beta=" + this.beta + "]" + generalScore
                                + "\ttopic score:" + topicScore + ",\tquery score:" + queryScore
                                + ",\tsimilarity score:" + similarityScore + "\t" + pair.getRight());
                        selectedClustDiversityScore = diversityScore;
                    }
                }
            }

            // Update the partial summary with the selected sentence
            if (null == selectedClustName || null == selectedSentence || selectedClustDiversityScore == -1) {
                log.warn("Selected clust or sentence is illegal[selectedClustName = " + selectedClustName
                        + ", selectedSentence = " + selectedSentence + "]");
                continue;
            }

            // Move the best sentence from the candidate set into the summary
            List<Pair<Float, String>> sentences = candidateSentences.get(selectedClustName).getSentences();
            int num = -1;
            for (int i = 0; i < sentences.size(); i++) {
                Pair<Float, String> sent = sentences.get(i);
                if (selectedSentence.getRight().equals(sent.getRight())) {
                    num = i;
                    break;
                }
            }
            if (num == -1) {
                log.error("The sentence num is illegal:" + num);
                return false;
            }
            Pair<Float, String> ss = sentences.remove(num);

            List<Pair<Float, String>> clustSentencesInSummary = partialSummary.get(selectedClustName);
            if (null == clustSentencesInSummary) {
                clustSentencesInSummary = new ArrayList<Pair<Float, String>>();
                partialSummary.put(selectedClustName, clustSentencesInSummary);
                log.debug("-->\t" + ss.getRight()); // was System.out.println
            }
            clustSentencesInSummary.add(ss);

            // Update the bookkeeping data
            List<Word> words = StanfordNLPTools.segmentWord(ss.getRight());
            psVectors.add(this.sentenceToVector(words));

            // 1. Update the summary word count (punctuation excluded)
            for (Word word : words) {
                if (CommonUtil.isPunctuation(word)) {
                    continue;
                }
                ++summaryWordCount;
            }

            // 2. Update the summary sentence count
            ++summarySentenceCount;

            // 3. Update the word frequencies of the summary
            for (Word word : words) {
                String key = word.getName().toLowerCase();
                Integer freq = wordFrequencyInPartialSummary.get(key);
                if (null == freq) {
                    freq = 0;
                }
                freq += 1;
                // Store under the same lowercased key used for the lookup above
                wordFrequencyInPartialSummary.put(key, freq);
            }

            // 4. Update the cached diversity scores
            clusterDiversies.put(selectedClustName, selectedClustDiversityScore);
        }

        // Save the summary
        StringBuilder summary = new StringBuilder();
        for (Entry<String, List<Pair<Float, String>>> entry : partialSummary.entrySet()) {
            for (Pair<Float, String> pair : entry.getValue()) {
                String sentence = pair.getRight();
                sentence = sentence.replaceAll("''", "").replaceAll("``", "");
                sentence = sentence.replaceAll("\\s+", " ");
                sentence = sentence.replaceAll("\\s+'s", "'s");
                /* sentence = sentence.replaceAll("-lrb-[\\s\\S]*?-rrb-\\s+", ""); */
                sentence = sentence.replaceAll("-lrb-", "");
                sentence = sentence.replaceAll("-rrb-", "");
                sentence = sentence.trim();
                // Append a period only when the trimmed sentence lacks one
                sentence = sentence.endsWith(".") ? (sentence + "\n") : (sentence + ".\n");
                summary.append(sentence);
            }
        }

        int indexOfPoint = this.filename.lastIndexOf(".");
        String summaryFilename = this.filename.substring(0, indexOfPoint - 1).toUpperCase() + ".M.250."
                + this.filename.substring(indexOfPoint - 1, indexOfPoint).toUpperCase() + ".3";
        try {
            File file = FileUtils.getFile(this.workDir + "/" + this.numDir + DIR_SUMMARIES_V2, summaryFilename);
            log.info("Saving summary to file[" + file.getAbsolutePath() + "]");
            FileUtils.writeStringToFile(file, summary.toString().trim(), DEFAULT_CHARSET);
        } catch (IOException e) {
            log.error("Save summary[" + this.filename + "] error!", e);
            throw e;
        }
        log.info("[Thread id:" + Thread.currentThread().getId() + "] build summary for["
                + this.topicname + "] finished!");
        return true;
    }
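    /*
     * Illustration (not part of the original class): how the selection loop in
     * call() combines the three signals into one score. The squashing mirrors
     * the expressions used above; generalScoreSketch and logistic are names
     * invented for this sketch, and logistic is assumed to match the
     * this.sigmoid(...) used in call().
     */
    private static double generalScoreSketch(double topicScore, double queryScore,
            double similarityScore, double alpha, double beta) {
        // log(x + 1) dampens large raw scores, the logistic function bounds
        // them, and (x - 0.5) * 4 re-centers the result on 0 with range (-2, 2)
        double squashedTopic = (logistic(Math.log(topicScore + 1)) - 0.5) * 4;
        double dampenedQuery = Math.log(queryScore + 1);
        double dampenedSimilarity = Math.log(similarityScore + 1);
        // Reward query relevance, penalize redundancy with the partial summary
        return squashedTopic + alpha * dampenedQuery - beta * dampenedSimilarity;
    }

    private static double logistic(double x) {
        // Standard logistic sigmoid
        return 1.0 / (1.0 + Math.exp(-x));
    }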