/**
 * Enqueues one scored category per child of the category wrapped by {@code scat}.
 *
 * <p>Every child is created at the distance of {@code scat} plus one. Nothing is enqueued when
 * that distance would exceed {@code maxDistance}.
 *
 * @param queue Queue which receives the newly created scored categories.
 * @param scat Scored category whose children are enqueued.
 * @param maxDistance Largest distance a queued category is allowed to have.
 */
private void fillPendingQueue(Queue<ScoredCategory> queue, ScoredCategory scat, int maxDistance) {
  Category parent = scat.getCategory();
  int childDistance = scat.getDistance() + 1;
  if (childDistance > maxDistance) {
    // The children would lie beyond the allowed exploration radius.
    return;
  }
  for (Category child : parent.getChildren()) {
    queue.add(GroupOfCategories.createScoredCategory(child, parent, childDistance));
  }
}
/**
 * Collects the categories of the groups which compose a domain.
 *
 * <p>The last group of {@code scoredGroups} whose score is greater than or equal to {@code
 * threshold} marks the cut: the categories of that group and of every group placed before it in
 * the list are returned, even if some of those earlier groups score below the threshold. If no
 * group reaches the threshold the result is empty.
 *
 * @param scoredGroups List of {@code GroupOfCategories} to check.
 * @param threshold Minimum score allowed.
 * @return Set of categories which belong to the domain.
 */
public Set<Category> getDomainCategories(List<GroupOfCategories> scoredGroups, double threshold) {
  // Position of the last group reaching the threshold; -1 means that no group does.
  int cutoff = -1;
  int position = 0;
  for (GroupOfCategories group : scoredGroups) {
    if (group.getScore() >= threshold) {
      cutoff = position;
    }
    position++;
  }

  HashSet<Category> result = new HashSet<Category>();
  for (GroupOfCategories group : scoredGroups.subList(0, cutoff + 1)) {
    for (ScoredCategory scored : group.getCategories()) {
      result.add(scored.getCategory());
    }
  }
  return result;
}
/** * Scores the groups which compose a domain and are at most as far as the {@code maxDistance} * parameter defines. The score of each group is calculated using {@link * GroupOfCategories#getScore()}. A group of categories is defined in this case as the set of * categories which are at the same distance from a root category. * * @param vocabulary The vocabulary of the domain. * @param root The category which defines the domain. * @param maxDistance Maximum distance from the root category where is allowed tro explore. * @return A list with all the groups of categories which can be accessed from the given root and * are at most at {@code maxDistance} from the root category . All the groups has been scored * and are indexed by the distance from the root category. It means, the position 0 of the * list only contains the root category because it is the unique category at distance 0 from * itself; the position {@code i} contains all the categories at distance {@code i} from the * root category. 
*/ public ArrayList<GroupOfCategories> scoreDomain( DomainVocabulary vocabulary, Category root, int maxDistance) { HashSet<Integer> visitedCategories = new HashSet<Integer>(); ArrayList<GroupOfCategories> scores = new ArrayList<GroupOfCategories>(); LinkedBlockingQueue<ScoredCategory> pendingCategories = new LinkedBlockingQueue<ScoredCategory>(); // Initialize structures with root category GroupOfCategories goc = new GroupOfCategories(root); ScoredCategory scat = GroupOfCategories.createScoredCategory(root, null, 0, true); goc.addScoredCategory(scat); scores.add(goc); fillPendingQueue(pendingCategories, scat, maxDistance); visitedCategories.add(root.getPageId()); // Explore the queued categories while (!pendingCategories.isEmpty()) { scat = pendingCategories.poll(); Category currentCat = scat.getCategory(); if (!visitedCategories.contains(currentCat.getPageId())) { boolean domain; try { domain = isDomain(currentCat, vocabulary); } catch (WikiTitleParsingException e) { domain = false; } scat.setDomain(domain); int currentDistance = scat.getDistance(); if (currentDistance < scores.size()) { scores.get(currentDistance).addScoredCategory(scat); } else { goc = new GroupOfCategories(root); goc.addScoredCategory(scat); scores.add(goc); } fillPendingQueue(pendingCategories, scat, maxDistance); visitedCategories.add(root.getPageId()); } else { continue; } } return scores; }