/**
 * Enqueues the children of a scored category so that they can be explored later.
 *
 * <p>Each child is wrapped in a new {@link ScoredCategory} whose distance is the distance of
 * {@code scat} plus one. Nothing is enqueued when that distance would exceed {@code maxDistance}.
 *
 * @param queue The queue of categories pending exploration; new entries are appended to it.
 * @param scat The scored category whose children have to be enqueued.
 * @param maxDistance Maximum distance allowed for an enqueued category.
 */
private void fillPendingQueue(Queue<ScoredCategory> queue, ScoredCategory scat, int maxDistance) {
  Category parent = scat.getCategory();
  int childDistance = scat.getDistance() + 1;

  // The children would lie beyond the exploration limit: nothing to enqueue.
  if (childDistance > maxDistance) {
    return;
  }

  for (Category child : parent.getChildren()) {
    queue.add(GroupOfCategories.createScoredCategory(child, parent, childDistance));
  }
}
/** * Scores the groups which compose a domain and are at most as far as the {@code maxDistance} * parameter defines. The score of each group is calculated using {@link * GroupOfCategories#getScore()}. A group of categories is defined in this case as the set of * categories which are at the same distance from a root category. * * @param vocabulary The vocabulary of the domain. * @param root The category which defines the domain. * @param maxDistance Maximum distance from the root category where is allowed tro explore. * @return A list with all the groups of categories which can be accessed from the given root and * are at most at {@code maxDistance} from the root category . All the groups has been scored * and are indexed by the distance from the root category. It means, the position 0 of the * list only contains the root category because it is the unique category at distance 0 from * itself; the position {@code i} contains all the categories at distance {@code i} from the * root category. 
*/ public ArrayList<GroupOfCategories> scoreDomain( DomainVocabulary vocabulary, Category root, int maxDistance) { HashSet<Integer> visitedCategories = new HashSet<Integer>(); ArrayList<GroupOfCategories> scores = new ArrayList<GroupOfCategories>(); LinkedBlockingQueue<ScoredCategory> pendingCategories = new LinkedBlockingQueue<ScoredCategory>(); // Initialize structures with root category GroupOfCategories goc = new GroupOfCategories(root); ScoredCategory scat = GroupOfCategories.createScoredCategory(root, null, 0, true); goc.addScoredCategory(scat); scores.add(goc); fillPendingQueue(pendingCategories, scat, maxDistance); visitedCategories.add(root.getPageId()); // Explore the queued categories while (!pendingCategories.isEmpty()) { scat = pendingCategories.poll(); Category currentCat = scat.getCategory(); if (!visitedCategories.contains(currentCat.getPageId())) { boolean domain; try { domain = isDomain(currentCat, vocabulary); } catch (WikiTitleParsingException e) { domain = false; } scat.setDomain(domain); int currentDistance = scat.getDistance(); if (currentDistance < scores.size()) { scores.get(currentDistance).addScoredCategory(scat); } else { goc = new GroupOfCategories(root); goc.addScoredCategory(scat); scores.add(goc); } fillPendingQueue(pendingCategories, scat, maxDistance); visitedCategories.add(root.getPageId()); } else { continue; } } return scores; }
public static void main(String[] args) throws WikiApiException { // configure the database connection parameters DatabaseConfiguration dbConfig = new DatabaseConfiguration(); dbConfig.setHost("SERVER_URL"); dbConfig.setDatabase("DATABASE"); dbConfig.setUser("USER"); dbConfig.setPassword("PASSWORD"); dbConfig.setLanguage(Language.german); // Create a new German wikipedia. Wikipedia wiki = new Wikipedia(dbConfig); // Get the category "Towns in Germany" String title = "Towns in Germany"; Category topCat; try { topCat = wiki.getCategory(title); } catch (WikiPageNotFoundException e) { throw new WikiApiException("Category " + title + " does not exist"); } // Add the pages categorized under "Towns in Germany". Set<String> towns = new TreeSet<String>(); for (Page p : topCat.getArticles()) { towns.add(p.getTitle().getPlainTitle()); } // Get the pages categorized under each subcategory of "Towns in Germany". for (Category townCategory : topCat.getDescendants()) { for (Page p : townCategory.getArticles()) { towns.add(p.getTitle().getPlainTitle()); } System.out.println("Number of towns: " + towns.size()); } // Output the pages for (String town : towns) { System.out.println(town); } }
/**
 * Creates the vocabulary related to the given category. This vocabulary is composed by the terms
 * that appear in the category articles and their frequency.
 *
 * <p>The text of every article of the category is obtained from its parsed version and added to
 * a new vocabulary whose locale is built from the name of the language of the wiki.
 *
 * @param category The category. Checked with {@code CHK.CHECK_NOT_NULL}.
 * @return The vocabulary related to the category.
 * @throws WikiApiException Propagated from the Wikipedia API calls used to read the articles of
 *     the category.
 */
public DomainVocabulary createCategoryVocabulary(Category category) throws WikiApiException {
  CHK.CHECK_NOT_NULL(category);

  Locale language = new Locale(wiki.getLanguage().name());
  DomainVocabulary vocabulary = new DomainVocabulary(language);

  // Iterate over the returned collection directly: downcasting it to HashSet would fail with a
  // ClassCastException as soon as getArticles() returned any other collection implementation.
  for (Page page : category.getArticles()) {
    String text = wiki.getParsedArticle(page.getPageId()).getText();
    vocabulary.addTerms(text);
  }
  return vocabulary;
}
/**
 * Tells whether a category belongs to the domain described by a vocabulary.
 *
 * <p>The plain title of the category is preprocessed by the vocabulary and the resulting terms
 * are looked up one by one; the category belongs to the domain as soon as one of them is
 * contained in the vocabulary.
 *
 * @param category The category to check.
 * @param vocabulary The vocabulary of the desired domain.
 * @return {@code true} if at least one term of the title is contained in the vocabulary.
 *     Otherwise, {@code false}.
 * @throws WikiTitleParsingException If the title of the category cannot be parsed.
 */
public boolean isDomain(Category category, DomainVocabulary vocabulary)
    throws WikiTitleParsingException {
  CHK.CHECK_NOT_NULL(category);

  String plainTitle = category.getTitle().getPlainTitle();
  Iterator<String> terms = vocabulary.preprocess(plainTitle).iterator();
  while (terms.hasNext()) {
    // The first matching term settles the answer; the remaining terms are not inspected.
    if (vocabulary.contains(terms.next())) {
      return true;
    }
  }
  return false;
}
public static void main(String args[]) throws IOException { String VOC_OUT_FILE = "%s_%s_%s.vocabulary"; CommandLine cLine = parseArguments(args); Locale language = new Locale(cLine.getOptionValue("l")); int year = Integer.valueOf(cLine.getOptionValue("y")); CategoryExplorer explorer = new CategoryExplorer(language, year); Category cat = null; DomainVocabulary vocabulary = new DomainVocabulary(explorer.getLocale()); ArrayList<GroupOfCategories> groupsList = null; int step = 1; if (cLine.hasOption("c") && cLine.hasOption("t") && cLine.hasOption("s") && cLine.hasOption("v")) { step = 2; String category = cLine.getOptionValue("c"); cat = explorer.loadCategory(Integer.parseInt(category)); File input = new File(cLine.getOptionValue("v")); vocabulary.insertFromFile(input); } if (cLine.hasOption("k") && cLine.hasOption("s")) { step = 3; // TODO init groupsList with k file } switch (step) { case 1: // Step 1: Create domain vocabulary logger.info("Step 1."); String category = cLine.getOptionValue("c"); cat = explorer.loadCategory(Integer.parseInt(category)); if (cat == null) { // TODO } try { vocabulary = explorer.createCategoryVocabulary(cat); } catch (WikiApiException e) { logger.errorEnd("Exception during Wikipedia DB access:" + e.getLocalizedMessage()); e.printStackTrace(); } String filename = String.format(VOC_OUT_FILE, category, language, year); File vocabularyFile = new File(System.getProperty("user.dir"), filename); vocabulary.toFile(vocabularyFile); case 2: // Step 2: Explore Wikipedia categories with the most frequent terms // of the vocabulary logger.info("Step 2."); String top = cLine.getOptionValue("t"); vocabulary = vocabulary.getTop(Float.parseFloat(top)); groupsList = explorer.scoreDomain(vocabulary, cat); // TODO Save groups case 3: // Extract the categories which belong to the domain logger.info("Step 3."); String score = cLine.getOptionValue("s"); HashSet<Category> domain = (HashSet<Category>) explorer.getDomainCategories(groupsList, 
Double.parseDouble(score)); for (Category domainCat : domain) { try { System.out.println( domainCat.getPageId() + "\t" + domainCat.getTitle().getWikiStyleTitle()); } catch (WikiTitleParsingException e) { logger.error("The category has no title assigned" + e.getLocalizedMessage()); e.printStackTrace(); } } } logger.info("End of the process."); }