/** * Scores the groups which compose a domain and are at most as far as the {@code maxDistance} * parameter defines. The score of each group is calculated using {@link * GroupOfCategories#getScore()}. A group of categories is defined in this case as the set of * categories which are at the same distance from a root category. * * @param vocabulary The vocabulary of the domain. * @param root The category which defines the domain. * @param maxDistance Maximum distance from the root category where is allowed tro explore. * @return A list with all the groups of categories which can be accessed from the given root and * are at most at {@code maxDistance} from the root category . All the groups has been scored * and are indexed by the distance from the root category. It means, the position 0 of the * list only contains the root category because it is the unique category at distance 0 from * itself; the position {@code i} contains all the categories at distance {@code i} from the * root category. */ public ArrayList<GroupOfCategories> scoreDomain( DomainVocabulary vocabulary, Category root, int maxDistance) { HashSet<Integer> visitedCategories = new HashSet<Integer>(); ArrayList<GroupOfCategories> scores = new ArrayList<GroupOfCategories>(); LinkedBlockingQueue<ScoredCategory> pendingCategories = new LinkedBlockingQueue<ScoredCategory>(); // Initialize structures with root category GroupOfCategories goc = new GroupOfCategories(root); ScoredCategory scat = GroupOfCategories.createScoredCategory(root, null, 0, true); goc.addScoredCategory(scat); scores.add(goc); fillPendingQueue(pendingCategories, scat, maxDistance); visitedCategories.add(root.getPageId()); // Explore the queued categories while (!pendingCategories.isEmpty()) { scat = pendingCategories.poll(); Category currentCat = scat.getCategory(); if (!visitedCategories.contains(currentCat.getPageId())) { boolean domain; try { domain = isDomain(currentCat, vocabulary); } catch (WikiTitleParsingException e) { domain = false; } scat.setDomain(domain); int currentDistance = scat.getDistance(); if (currentDistance < scores.size()) { scores.get(currentDistance).addScoredCategory(scat); } else { goc = new GroupOfCategories(root); goc.addScoredCategory(scat); scores.add(goc); } fillPendingQueue(pendingCategories, scat, maxDistance); visitedCategories.add(root.getPageId()); } else { continue; } } return scores; }
public static void main(String args[]) throws IOException { String VOC_OUT_FILE = "%s_%s_%s.vocabulary"; CommandLine cLine = parseArguments(args); Locale language = new Locale(cLine.getOptionValue("l")); int year = Integer.valueOf(cLine.getOptionValue("y")); CategoryExplorer explorer = new CategoryExplorer(language, year); Category cat = null; DomainVocabulary vocabulary = new DomainVocabulary(explorer.getLocale()); ArrayList<GroupOfCategories> groupsList = null; int step = 1; if (cLine.hasOption("c") && cLine.hasOption("t") && cLine.hasOption("s") && cLine.hasOption("v")) { step = 2; String category = cLine.getOptionValue("c"); cat = explorer.loadCategory(Integer.parseInt(category)); File input = new File(cLine.getOptionValue("v")); vocabulary.insertFromFile(input); } if (cLine.hasOption("k") && cLine.hasOption("s")) { step = 3; // TODO init groupsList with k file } switch (step) { case 1: // Step 1: Create domain vocabulary logger.info("Step 1."); String category = cLine.getOptionValue("c"); cat = explorer.loadCategory(Integer.parseInt(category)); if (cat == null) { // TODO } try { vocabulary = explorer.createCategoryVocabulary(cat); } catch (WikiApiException e) { logger.errorEnd("Exception during Wikipedia DB access:" + e.getLocalizedMessage()); e.printStackTrace(); } String filename = String.format(VOC_OUT_FILE, category, language, year); File vocabularyFile = new File(System.getProperty("user.dir"), filename); vocabulary.toFile(vocabularyFile); case 2: // Step 2: Explore Wikipedia categories with the most frequent terms // of the vocabulary logger.info("Step 2."); String top = cLine.getOptionValue("t"); vocabulary = vocabulary.getTop(Float.parseFloat(top)); groupsList = explorer.scoreDomain(vocabulary, cat); // TODO Save groups case 3: // Extract the categories which belong to the domain logger.info("Step 3."); String score = cLine.getOptionValue("s"); HashSet<Category> domain = (HashSet<Category>) explorer.getDomainCategories(groupsList, Double.parseDouble(score)); for (Category domainCat : domain) { try { System.out.println( domainCat.getPageId() + "\t" + domainCat.getTitle().getWikiStyleTitle()); } catch (WikiTitleParsingException e) { logger.error("The category has no title assigned" + e.getLocalizedMessage()); e.printStackTrace(); } } } logger.info("End of the process."); }