예제 #1
0
  /**
   * Scores the groups of categories which compose a domain and are at most {@code maxDistance}
   * away from the root category. The score of each group is calculated using {@link
   * GroupOfCategories#getScore()}. A group of categories is defined in this case as the set of
   * categories which are at the same distance from the root category.
   *
   * <p>Each category is identified by its page id and is scored at most once: the first time it
   * is taken from the pending queue. Later occurrences of the same category are skipped.
   *
   * @param vocabulary The vocabulary of the domain.
   * @param root The category which defines the domain.
   * @param maxDistance Maximum distance from the root category that may be explored.
   * @return A list with all the groups of categories which can be accessed from the given root and
   *     are at most at {@code maxDistance} from the root category. All the groups have been scored
   *     and are indexed by the distance from the root category. It means, the position 0 of the
   *     list only contains the root category because it is the unique category at distance 0 from
   *     itself; the position {@code i} contains all the categories at distance {@code i} from the
   *     root category.
   */
  public ArrayList<GroupOfCategories> scoreDomain(
      DomainVocabulary vocabulary, Category root, int maxDistance) {

    HashSet<Integer> visitedCategories = new HashSet<Integer>();
    ArrayList<GroupOfCategories> scores = new ArrayList<GroupOfCategories>();
    LinkedBlockingQueue<ScoredCategory> pendingCategories =
        new LinkedBlockingQueue<ScoredCategory>();

    // Initialize structures with the root category: it forms the group at distance 0 and is
    // flagged as belonging to the domain.
    GroupOfCategories goc = new GroupOfCategories(root);
    ScoredCategory scat = GroupOfCategories.createScoredCategory(root, null, 0, true);
    goc.addScoredCategory(scat);
    scores.add(goc);

    fillPendingQueue(pendingCategories, scat, maxDistance);
    visitedCategories.add(root.getPageId());

    // Explore the queued categories in FIFO order.
    while (!pendingCategories.isEmpty()) {
      scat = pendingCategories.poll();
      Category currentCat = scat.getCategory();

      // Mark the category being processed (not the root) as visited. Set.add returns false when
      // the id was already present, in which case the category has been scored before.
      if (!visitedCategories.add(currentCat.getPageId())) {
        continue;
      }

      boolean domain;
      try {
        domain = isDomain(currentCat, vocabulary);
      } catch (WikiTitleParsingException e) {
        // A category whose title cannot be parsed is treated as not belonging to the domain.
        domain = false;
      }
      scat.setDomain(domain);

      // NOTE(review): appending a new group assumes categories are dequeued in non-decreasing
      // distance order, so that the new group's index equals currentDistance - confirm against
      // fillPendingQueue.
      int currentDistance = scat.getDistance();
      if (currentDistance < scores.size()) {
        scores.get(currentDistance).addScoredCategory(scat);
      } else {
        goc = new GroupOfCategories(root);
        goc.addScoredCategory(scat);
        scores.add(goc);
      }
      fillPendingQueue(pendingCategories, scat, maxDistance);
    }
    return scores;
  }
예제 #2
0
  /**
   * Command-line entry point. Runs a pipeline of up to three steps and prints the resulting
   * domain categories to standard output as {@code pageId<TAB>title} lines.
   *
   * <p>Options read from the command line:
   *
   * <ul>
   *   <li>{@code l}: language, used to build the {@link Locale}.
   *   <li>{@code y}: year, parsed as an integer.
   *   <li>{@code c}: numeric identifier of the category to load.
   *   <li>{@code t}: value parsed as a float and passed to {@code DomainVocabulary.getTop}.
   *   <li>{@code s}: score threshold, parsed as a double.
   *   <li>{@code v}: path of an existing vocabulary file.
   *   <li>{@code k}: selects step 3 directly; loading the groups from it is not implemented yet.
   * </ul>
   *
   * <p>The starting step is 1 by default, 2 when {@code c}, {@code t}, {@code s} and {@code v}
   * are all present, and 3 when {@code k} and {@code s} are present. Execution then continues
   * through all the following steps.
   *
   * @param args Raw command-line arguments.
   * @throws IOException Declared for the vocabulary file operations (presumably reading and
   *     writing the vocabulary file; confirm against {@code DomainVocabulary}).
   */
  public static void main(String[] args) throws IOException {
    String vocabularyFileFormat = "%s_%s_%s.vocabulary";
    CommandLine cLine = parseArguments(args);
    Locale language = new Locale(cLine.getOptionValue("l"));
    int year = Integer.parseInt(cLine.getOptionValue("y"));
    CategoryExplorer explorer = new CategoryExplorer(language, year);
    Category cat = null;
    DomainVocabulary vocabulary = new DomainVocabulary(explorer.getLocale());
    ArrayList<GroupOfCategories> groupsList = null;

    // Decide at which step the pipeline starts, depending on the options provided.
    int step = 1;
    if (cLine.hasOption("c")
        && cLine.hasOption("t")
        && cLine.hasOption("s")
        && cLine.hasOption("v")) {
      // A vocabulary file already exists: load it and skip its creation.
      step = 2;
      String category = cLine.getOptionValue("c");
      cat = explorer.loadCategory(Integer.parseInt(category));
      File input = new File(cLine.getOptionValue("v"));
      vocabulary.insertFromFile(input);
    }
    if (cLine.hasOption("k") && cLine.hasOption("s")) {
      step = 3;
      // TODO init groupsList with k file
    }

    // The cases intentionally fall through: each step feeds the next one.
    switch (step) {
      case 1:
        // Step 1: Create domain vocabulary
        logger.info("Step 1.");
        String category = cLine.getOptionValue("c");
        cat = explorer.loadCategory(Integer.parseInt(category));
        if (cat == null) {
          // Nothing can be computed without the root category.
          logger.error("Category not found: " + category);
          return;
        }
        try {
          vocabulary = explorer.createCategoryVocabulary(cat);
        } catch (WikiApiException e) {
          logger.errorEnd("Exception during Wikipedia DB access:" + e.getLocalizedMessage());
          e.printStackTrace();
        }
        String filename = String.format(vocabularyFileFormat, category, language, year);
        File vocabularyFile = new File(System.getProperty("user.dir"), filename);
        vocabulary.toFile(vocabularyFile);
        // fall through
      case 2:
        // Step 2: Explore Wikipedia categories with the most frequent terms
        // of the vocabulary
        logger.info("Step 2.");
        if (cat == null) {
          // Reached when the pipeline starts at step 2 and the category could not be loaded.
          logger.error("Category not found: " + cLine.getOptionValue("c"));
          return;
        }
        String top = cLine.getOptionValue("t");
        vocabulary = vocabulary.getTop(Float.parseFloat(top));
        groupsList = explorer.scoreDomain(vocabulary, cat);
        // TODO Save groups
        // fall through
      case 3:
        // Step 3: Extract the categories which belong to the domain
        // NOTE(review): when the pipeline starts at step 3, groupsList is still null because
        // loading it from the "k" file is not implemented yet.
        logger.info("Step 3.");
        String score = cLine.getOptionValue("s");
        HashSet<Category> domain =
            (HashSet<Category>) explorer.getDomainCategories(groupsList, Double.parseDouble(score));
        for (Category domainCat : domain) {
          try {
            System.out.println(
                domainCat.getPageId() + "\t" + domainCat.getTitle().getWikiStyleTitle());
          } catch (WikiTitleParsingException e) {
            logger.error("The category has no title assigned" + e.getLocalizedMessage());
            e.printStackTrace();
          }
        }
    }
    logger.info("End of the process.");
  }