/**
 * Looks up the concepts registered for {@code key}, falling back to automatic
 * concept combination when the superclass lookup yields nothing.
 *
 * @param key the word to look up
 * @return the concepts returned by the superclass when that result is not blank;
 *         otherwise the result of {@code autoCombineConcepts(key, null)}
 */
@Override
public Collection<Concept> getConcepts(String key) {
    Collection<Concept> known = super.getConcepts(key);
    if (!BlankUtils.isBlank(known)) {
        return known;
    }
    // Nothing registered for this word: treat it as out-of-vocabulary and
    // build its concepts from its segments, without reference concepts.
    return autoCombineConcepts(key, null);
}
/**
 * Splits an out-of-vocabulary word into segments, working from the end of the
 * word towards its beginning, so the returned list is in reverse order of
 * appearance. For example, "娱乐场" is stored as: [场] → [娱乐].
 *
 * <p>For each segment, leading characters are dropped from the remaining text
 * until the remaining suffix has concepts in the superclass lookup or only one
 * character is left; that suffix becomes the next segment and is cut off the
 * end of the remaining text. A single trailing character is therefore emitted
 * even when no concept is registered for it.
 *
 * @param oov_word the out-of-vocabulary word; {@code null} or empty yields an empty list
 * @param topN     stop after this many segments have been collected (the limit is
 *                 checked after a segment is added, so a non-empty word always
 *                 yields at least one segment)
 * @return the collected segments, last segment of the word first
 */
private List<String> segmentOOV(String oov_word, int topN) {
    List<String> segments = new LinkedList<>();
    String remaining = oov_word;
    int collected = 0;

    while (remaining != null && !remaining.isEmpty()) {
        // Advance the start index until the suffix is a known word
        // or has shrunk to a single character.
        int start = 0;
        while (remaining.length() - start > 1
                && BlankUtils.isBlank(super.getConcepts(remaining.substring(start)))) {
            start++;
        }

        segments.add(remaining.substring(start));
        collected++;
        if (collected >= topN) {
            break;
        }

        // Continue with whatever precedes the segment just taken.
        remaining = remaining.substring(0, start);
    }

    return segments;
}
/** * 计算未登录词语oov_word自动组合语义 * * @param oov_word 未登录词,此处指知网概念中未出现的词语,需要进行切分,求解组合语义, 组合的语义关系通过参照概念refConcepts进行修正 * @param refConcepts 简单计算出的oov_word的概念定义,需要通过refConcepts修正义原之间的符号、关系等 * @return */ public Collection<Concept> autoCombineConcepts(String oov_word, Collection<Concept> refConcepts) { ConceptLinkedList oovConcepts = new ConceptLinkedList(); if (oov_word == null) { return oovConcepts; } // 只获取倒排后的三个未识别词语,如果太多了,一方面会影响运行速度,另一方面组合过多的意义也不是很有用 for (String concept_word : segmentOOV(oov_word, 3)) { Collection<Concept> concepts = super.getConcepts(concept_word); if (oovConcepts.size() == 0) { oovConcepts.addAll(concepts); continue; } ConceptLinkedList tmpConcepts = new ConceptLinkedList(); for (Concept head : concepts) { for (Concept tail : oovConcepts) { if (!BlankUtils.isBlank(refConcepts)) { for (Concept ref : refConcepts) { tmpConcepts.addByDefine(autoCombineConcept(head, tail, ref)); } } else { tmpConcepts.addByDefine(autoCombineConcept(head, tail, null)); } } } oovConcepts = tmpConcepts; } /** 如果组合过多,则删除最后的1/3个组合 */ if ((oovConcepts.size() > MAX_COMBINED_COUNT)) { oovConcepts.removeLast(MAX_COMBINED_COUNT / 3); } return oovConcepts; }
/** * 获取两个词语的相似度,如果一个词语对应多个概念,则返回相似度最大的一对 * * @param word1 * @param word2 * @return */ @Override public double getSimilarity(String word1, String word2) { double similarity = 0.0; // 如果两个词语相同,则直接返回1.0 if (word1.equals(word2)) { return 1.0; } Collection<Concept> concepts1 = super.getConcepts(word1); Collection<Concept> concepts2 = super.getConcepts(word2); // 如果是blank,则说明是未登录词, 需要计算组合概念 if (BlankUtils.isBlank(concepts1) && !BlankUtils.isBlank(concepts2)) { concepts1 = autoCombineConcepts(word1, concepts2); } else if (BlankUtils.isBlank(concepts2) && !BlankUtils.isBlank(concepts1)) { concepts2 = autoCombineConcepts(word2, concepts1); } else if (BlankUtils.isBlank(concepts1) && BlankUtils.isBlank(concepts2)) { concepts1 = autoCombineConcepts(word1, concepts2); concepts2 = autoCombineConcepts(word2, concepts1); // 相互修正 concepts1 = autoCombineConcepts(word1, concepts2); concepts2 = autoCombineConcepts(word2, concepts1); } // 两个for循环分别计算词语所有可能的概念的相似度 for (Concept c1 : concepts1) { for (Concept c2 : concepts2) { double v = getSimilarity(c1, c2); if (v > similarity) { similarity = v; } if (similarity == 1.0) { break; } } } return similarity; }