/**
 * Enqueues the children of a scored category so they can be explored later.
 *
 * <p>Every child is wrapped in a new {@link ScoredCategory} whose distance is the distance of
 * {@code scat} plus one. Nothing is enqueued when that distance would exceed {@code
 * maxDistance}.
 *
 * @param queue The queue which receives the children.
 * @param scat The scored category whose children are enqueued.
 * @param maxDistance Maximum distance a queued category may have.
 */
private void fillPendingQueue(Queue<ScoredCategory> queue, ScoredCategory scat, int maxDistance) {
  final Category parent = scat.getCategory();
  final int childDistance = scat.getDistance() + 1;

  // The children would lie beyond the exploration limit: nothing to enqueue.
  if (childDistance > maxDistance) {
    return;
  }

  for (Category child : parent.getChildren()) {
    queue.add(GroupOfCategories.createScoredCategory(child, parent, childDistance));
  }
}
  /**
   * Scores the groups which compose a domain and are at most as far as the {@code maxDistance}
   * parameter defines. The score of each group is calculated using {@link
   * GroupOfCategories#getScore()}. A group of categories is defined in this case as the set of
   * categories which are at the same distance from a root category.
   *
   * <p>The categories are explored through a FIFO queue starting at {@code root}. Each category is
   * processed only once, identified by its page id: the first time it is taken from the queue it
   * is flagged as domain / not domain with {@link #isDomain(Category, DomainVocabulary)}, added to
   * the group of its distance and its children are queued. Later occurrences of the same category
   * (reached through another parent or through a cycle) are skipped.
   *
   * @param vocabulary The vocabulary of the domain.
   * @param root The category which defines the domain.
   * @param maxDistance Maximum distance from the root category that is allowed to be explored.
   * @return A list with all the groups of categories which can be accessed from the given root and
   *     are at most at {@code maxDistance} from the root category. All the groups have been scored
   *     and are indexed by the distance from the root category. It means, the position 0 of the
   *     list only contains the root category because it is the unique category at distance 0 from
   *     itself; the position {@code i} contains all the categories at distance {@code i} from the
   *     root category.
   */
  public ArrayList<GroupOfCategories> scoreDomain(
      DomainVocabulary vocabulary, Category root, int maxDistance) {

    HashSet<Integer> visitedCategories = new HashSet<Integer>();
    ArrayList<GroupOfCategories> scores = new ArrayList<GroupOfCategories>();
    LinkedBlockingQueue<ScoredCategory> pendingCategories =
        new LinkedBlockingQueue<ScoredCategory>();

    // Initialize structures with root category
    GroupOfCategories goc = new GroupOfCategories(root);
    ScoredCategory scat = GroupOfCategories.createScoredCategory(root, null, 0, true);
    goc.addScoredCategory(scat);
    scores.add(goc);

    visitedCategories.add(root.getPageId());
    fillPendingQueue(pendingCategories, scat, maxDistance);

    // Explore the queued categories
    while (!pendingCategories.isEmpty()) {
      scat = pendingCategories.poll();
      Category currentCat = scat.getCategory();

      // Mark the CURRENT category as visited (add() returns false when the id is already
      // present). Marking the root here instead would leave the set without any explored
      // category, so shared sub-categories and cycles would be processed again and again.
      if (!visitedCategories.add(currentCat.getPageId())) {
        continue;
      }

      boolean domain;
      try {
        domain = isDomain(currentCat, vocabulary);
      } catch (WikiTitleParsingException e) {
        // Best effort: a category whose title cannot be parsed is treated as out of the domain.
        domain = false;
      }
      scat.setDomain(domain);

      // Add the category to the group of its distance, creating the group on first use.
      int currentDistance = scat.getDistance();
      if (currentDistance < scores.size()) {
        scores.get(currentDistance).addScoredCategory(scat);
      } else {
        goc = new GroupOfCategories(root);
        goc.addScoredCategory(scat);
        scores.add(goc);
      }

      fillPendingQueue(pendingCategories, scat, maxDistance);
    }
    return scores;
  }
  // Example #3
  // 0
  /**
   * Prints the titles of the pages categorized under "Towns in Germany" and under every
   * descendant of that category, using a German Wikipedia database.
   *
   * <p>The connection settings are placeholders and have to be replaced with real values. While
   * the descendant categories are processed, the number of distinct titles collected so far is
   * printed after each descendant; the sorted titles are printed at the end.
   *
   * @param args Command line arguments; they are not read.
   * @throws WikiApiException If the category does not exist or the Wikipedia access fails.
   */
  public static void main(String[] args) throws WikiApiException {
    final String categoryTitle = "Towns in Germany";

    // Connection parameters of the Wikipedia database (placeholders).
    DatabaseConfiguration connection = new DatabaseConfiguration();
    connection.setHost("SERVER_URL");
    connection.setDatabase("DATABASE");
    connection.setUser("USER");
    connection.setPassword("PASSWORD");
    connection.setLanguage(Language.german);

    Wikipedia germanWiki = new Wikipedia(connection);

    // Look up the top category; a missing category is reported with a readable message.
    Category townsRoot;
    try {
      townsRoot = germanWiki.getCategory(categoryTitle);
    } catch (WikiPageNotFoundException e) {
      throw new WikiApiException("Category " + categoryTitle + " does not exist");
    }

    // A TreeSet keeps the titles unique and sorted for the final listing.
    Set<String> townTitles = new TreeSet<String>();

    // Pages filed directly under the top category.
    collectArticleTitles(townsRoot, townTitles);

    // Pages filed under each descendant category, reporting progress after each one.
    for (Category descendant : townsRoot.getDescendants()) {
      collectArticleTitles(descendant, townTitles);
      System.out.println("Number of towns: " + townTitles.size());
    }

    for (String townTitle : townTitles) {
      System.out.println(townTitle);
    }
  }

  /** Adds the plain title of every article of {@code category} to {@code titles}. */
  private static void collectArticleTitles(Category category, Set<String> titles)
      throws WikiApiException {
    for (Page article : category.getArticles()) {
      titles.add(article.getTitle().getPlainTitle());
    }
  }
  /**
   * Creates the vocabulary related to the given category. The vocabulary is built by adding the
   * text of every article of the category, obtained through {@code wiki.getParsedArticle}, to a
   * new {@link DomainVocabulary}.
   *
   * <p>The locale of the vocabulary is created from the name of the language of {@code wiki}.
   * NOTE(review): the language name is passed to {@link Locale} as-is; confirm it is a value
   * {@code Locale} understands (for example an ISO 639 code).
   *
   * @param category The category. It is checked with {@code CHK.CHECK_NOT_NULL}.
   * @return The vocabulary related to the category.
   * @throws WikiApiException If the access to the Wikipedia data fails.
   */
  public DomainVocabulary createCategoryVocabulary(Category category) throws WikiApiException {
    CHK.CHECK_NOT_NULL(category);
    Locale language = new Locale(wiki.getLanguage().name());
    DomainVocabulary vocabulary = new DomainVocabulary(language);

    // Iterate the returned set directly: only iteration is needed, so there is no reason to
    // downcast to HashSet and depend on the concrete set implementation behind getArticles().
    for (Page page : category.getArticles()) {
      String text = wiki.getParsedArticle(page.getPageId()).getText();
      vocabulary.addTerms(text);
    }

    return vocabulary;
  }
  /**
   * Tells whether a category is part of the domain described by a vocabulary. The plain title of
   * the category is preprocessed with the vocabulary, and the category is accepted as soon as one
   * of the resulting terms is contained in the vocabulary.
   *
   * @param category The category to check. It is checked with {@code CHK.CHECK_NOT_NULL}.
   * @param vocabulary The vocabulary of the desired domain.
   * @return {@code true} if at least one term of the title is in the vocabulary. Otherwise,
   *     {@code false}.
   * @throws WikiTitleParsingException If the title of the category cannot be parsed.
   */
  public boolean isDomain(Category category, DomainVocabulary vocabulary)
      throws WikiTitleParsingException {
    CHK.CHECK_NOT_NULL(category);

    String plainTitle = category.getTitle().getPlainTitle();
    Iterator<String> terms = vocabulary.preprocess(plainTitle).iterator();

    // The first matching term settles the answer; the remaining terms are not inspected.
    while (terms.hasNext()) {
      if (vocabulary.contains(terms.next())) {
        return true;
      }
    }
    return false;
  }
  /**
   * Command line entry point of the domain extraction process. The process has three steps and
   * the options present on the command line decide at which step it starts; every later step
   * runs as well because the {@code switch} below has no {@code break} statements.
   *
   * <ul>
   *   <li>Step 1 (default): creates the vocabulary of the category given with {@code c} and
   *       writes it to a file in the working directory.
   *   <li>Step 2 (when {@code c}, {@code t}, {@code s} and {@code v} are all present): loads the
   *       vocabulary from the file given with {@code v}, keeps its top terms as selected by
   *       {@code t} and scores the categories of the domain.
   *   <li>Step 3 (when {@code k} and {@code s} are present): prints the page id and title of the
   *       categories returned by {@code explorer.getDomainCategories} for the score {@code s}.
   * </ul>
   *
   * <p>The options {@code l} (language) and {@code y} (year) are always read.
   *
   * @param args Command line arguments, parsed by {@code parseArguments}.
   * @throws IOException Declared by this method; presumably raised by the vocabulary file
   *     operations - TODO confirm against {@code DomainVocabulary}.
   */
  public static void main(String args[]) throws IOException {
    // Pattern of the vocabulary file name written by step 1: category, language and year.
    String VOC_OUT_FILE = "%s_%s_%s.vocabulary";
    CommandLine cLine = parseArguments(args);
    Locale language = new Locale(cLine.getOptionValue("l"));
    int year = Integer.valueOf(cLine.getOptionValue("y"));
    CategoryExplorer explorer = new CategoryExplorer(language, year);
    Category cat = null;
    DomainVocabulary vocabulary = new DomainVocabulary(explorer.getLocale());
    ArrayList<GroupOfCategories> groupsList = null;
    // Step at which the process starts; the option checks below may raise it to 2 or 3.
    int step = 1;
    if (cLine.hasOption("c")
        && cLine.hasOption("t")
        && cLine.hasOption("s")
        && cLine.hasOption("v")) {
      // Start at step 2: the category and a previously saved vocabulary are loaded here.
      step = 2;
      String category = cLine.getOptionValue("c");
      cat = explorer.loadCategory(Integer.parseInt(category));
      File input = new File(cLine.getOptionValue("v"));
      vocabulary.insertFromFile(input);
    }
    if (cLine.hasOption("k") && cLine.hasOption("s")) {
      // Start at step 3. This check comes last, so it overrides the step 2 selection above.
      step = 3;
      // TODO init groupsList with k file
      // NOTE(review): until the TODO is done, groupsList is still null when step 3 starts here.
    }

    switch (step) {
      case 1:
        // Step 1: Create domain vocabulary
        logger.info("Step 1.");
        // NOTE(review): step 1 reads option "c" without a hasOption check - confirm that
        // parseArguments makes it mandatory.
        String category = cLine.getOptionValue("c");
        cat = explorer.loadCategory(Integer.parseInt(category));
        if (cat == null) {
          // TODO
          // NOTE(review): a null category is not handled yet and is passed on to
          // createCategoryVocabulary below.
        }
        try {
          vocabulary = explorer.createCategoryVocabulary(cat);
        } catch (WikiApiException e) {
          // The error is logged and the process goes on with the vocabulary created at the top
          // of this method, which is then written to the file below.
          logger.errorEnd("Exception during Wikipedia DB access:" + e.getLocalizedMessage());
          e.printStackTrace();
        }
        // The vocabulary file is written to the current working directory ("user.dir").
        String filename = String.format(VOC_OUT_FILE, category, language, year);
        File vocabularyFile = new File(System.getProperty("user.dir"), filename);
        vocabulary.toFile(vocabularyFile);
        // No break: execution continues with step 2 (appears intentional - confirm).
      case 2:
        // Step 2: Explore Wikipedia categories with the most frequent terms
        // of the vocabulary
        logger.info("Step 2.");
        // NOTE(review): option "t" is read here even when the process started at step 1, where
        // its presence was not checked.
        String top = cLine.getOptionValue("t");
        vocabulary = vocabulary.getTop(Float.parseFloat(top));
        groupsList = explorer.scoreDomain(vocabulary, cat);
        // TODO Save groups
        // No break: execution continues with step 3 (appears intentional - confirm).
      case 3:
        // Extract the categories which belong to the domain
        logger.info("Step 3.");
        String score = cLine.getOptionValue("s");
        // NOTE(review): the cast requires getDomainCategories to return a HashSet at runtime.
        HashSet<Category> domain =
            (HashSet<Category>) explorer.getDomainCategories(groupsList, Double.parseDouble(score));
        // One output line per category: page id, a tab and the wiki style title.
        for (Category domainCat : domain) {
          try {
            System.out.println(
                domainCat.getPageId() + "\t" + domainCat.getTitle().getWikiStyleTitle());
          } catch (WikiTitleParsingException e) {
            // A category whose title cannot be parsed is reported and skipped.
            logger.error("The category has no title assigned" + e.getLocalizedMessage());
            e.printStackTrace();
          }
        }
    }
    logger.info("End of the process.");
  }